runmat-runtime 0.5.5

//! MATLAB-compatible `textscan` builtin for formatted text imports.

use std::collections::HashSet;
use std::io::{Read, Seek, SeekFrom};

use encoding_rs::{Encoding, SHIFT_JIS};
use runmat_builtins::{
    BuiltinCompletionPolicy, BuiltinDescriptor, BuiltinErrorDescriptor, BuiltinOutputMode,
    BuiltinParamArity, BuiltinParamDescriptor, BuiltinParamType, BuiltinSignatureDescriptor,
    CellArray, Tensor, Value,
};
use runmat_macros::runtime_builtin;

use crate::builtins::common::spec::{
    BroadcastSemantics, BuiltinFusionSpec, BuiltinGpuSpec, ConstantStrategy, GpuOpKind,
    ReductionNaN, ResidencyPolicy, ShapeRequirements,
};
use crate::builtins::io::filetext::{helpers::decode_bytes, registry};
use crate::{build_runtime_error, gather_if_needed_async, BuiltinResult, RuntimeError};

const BUILTIN_NAME: &str = "textscan";

/// Output parameters advertised for `textscan`: a single required value named `C`.
const TEXTSCAN_OUTPUTS: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
    name: "C",
    ty: BuiltinParamType::Any,
    arity: BuiltinParamArity::Required,
    default: None,
    description: "Cell array containing one output per conversion, or collected groups.",
}];
/// Inputs of the two-argument call form: the text (or file identifier) and the
/// format specification, both required.
const TEXTSCAN_INPUTS_TEXT_FORMAT: [BuiltinParamDescriptor; 2] = [
    BuiltinParamDescriptor {
        name: "textOrFileID",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Input text or file identifier opened by fopen.",
    },
    BuiltinParamDescriptor {
        name: "formatSpec",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Format specification such as '%f %s'.",
    },
];
/// Inputs of the extended call form: the two required arguments followed by a
/// variadic tail (`args...`) carrying the repeat count and name-value options.
const TEXTSCAN_INPUTS_TEXT_FORMAT_OPTIONS: [BuiltinParamDescriptor; 3] = [
    BuiltinParamDescriptor {
        name: "textOrFileID",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Input text or file identifier opened by fopen.",
    },
    BuiltinParamDescriptor {
        name: "formatSpec",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Format specification such as '%f %s'.",
    },
    BuiltinParamDescriptor {
        name: "args...",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Variadic,
        default: None,
        description: "Optional repeat count followed by name-value pairs.",
    },
];
/// The two call signatures exposed through the builtin descriptor; both share
/// `TEXTSCAN_OUTPUTS` and differ only in whether the variadic tail is present.
const TEXTSCAN_SIGNATURES: [BuiltinSignatureDescriptor; 2] = [
    BuiltinSignatureDescriptor {
        label: "C = textscan(textOrFileID, formatSpec)",
        inputs: &TEXTSCAN_INPUTS_TEXT_FORMAT,
        outputs: &TEXTSCAN_OUTPUTS,
    },
    BuiltinSignatureDescriptor {
        label: "C = textscan(textOrFileID, formatSpec, args...)",
        inputs: &TEXTSCAN_INPUTS_TEXT_FORMAT_OPTIONS,
        outputs: &TEXTSCAN_OUTPUTS,
    },
];

/// Error descriptor for malformed inputs, repeat counts, or name-value options.
const TEXTSCAN_ERROR_ARGUMENT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.TEXTSCAN.ARGUMENT",
    identifier: Some("RunMat:textscan:InvalidArgument"),
    when: "Input, format specification, repeat count, or name-value options are malformed.",
    message: "textscan: invalid argument",
};
/// Error descriptor for format specifications that cannot be parsed or that
/// use an unsupported conversion.
const TEXTSCAN_ERROR_FORMAT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.TEXTSCAN.FORMAT",
    identifier: Some("RunMat:textscan:InvalidFormat"),
    when: "Format specification cannot be parsed or contains unsupported conversions.",
    message: "textscan: invalid format specification",
};
/// Error descriptor for invalid file identifiers and file read/seek/decode failures.
const TEXTSCAN_ERROR_FILE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.TEXTSCAN.FILE",
    identifier: Some("RunMat:textscan:File"),
    when: "A file identifier is invalid or cannot be read.",
    message: "textscan: file read failed",
};
/// Error descriptor for input text that does not match the format specification.
const TEXTSCAN_ERROR_PARSE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.TEXTSCAN.PARSE",
    identifier: Some("RunMat:textscan:Parse"),
    when: "Input text cannot be parsed according to the format specification.",
    message: "textscan: parse failed",
};
/// Every error descriptor `textscan` can report, in the order published by the descriptor.
const TEXTSCAN_ERRORS: [BuiltinErrorDescriptor; 4] = [
    TEXTSCAN_ERROR_ARGUMENT,
    TEXTSCAN_ERROR_FORMAT,
    TEXTSCAN_ERROR_FILE,
    TEXTSCAN_ERROR_PARSE,
];

/// Public builtin descriptor for `textscan`, bundling `TEXTSCAN_SIGNATURES` and
/// `TEXTSCAN_ERRORS`. It is referenced by path from the `runtime_builtin`
/// attribute on `textscan_builtin`.
pub const TEXTSCAN_DESCRIPTOR: BuiltinDescriptor = BuiltinDescriptor {
    signatures: &TEXTSCAN_SIGNATURES,
    output_mode: BuiltinOutputMode::Fixed,
    completion_policy: BuiltinCompletionPolicy::Public,
    errors: &TEXTSCAN_ERRORS,
};

// GPU specification registered for `textscan`. It lists no supported
// precisions and no provider hooks, and asks for immediate gathering of
// inputs: per the `notes` field, the builtin runs on the host.
#[runmat_macros::register_gpu_spec(builtin_path = "crate::builtins::io::textscan")]
pub const GPU_SPEC: BuiltinGpuSpec = BuiltinGpuSpec {
    name: "textscan",
    op_kind: GpuOpKind::Custom("io-textscan"),
    supported_precisions: &[],
    broadcast: BroadcastSemantics::None,
    provider_hooks: &[],
    constant_strategy: ConstantStrategy::InlineLiteral,
    residency: ResidencyPolicy::GatherImmediately,
    nan_mode: ReductionNaN::Include,
    two_pass_threshold: None,
    workgroup_size: None,
    accepts_nan_mode: false,
    notes: "Runs on the host; formatted text import is not an acceleration operation.",
};

// Fusion specification registered for `textscan`. Neither an elementwise nor a
// reduction template is provided; per the `notes` field the builtin is not
// eligible for fusion.
#[runmat_macros::register_fusion_spec(builtin_path = "crate::builtins::io::textscan")]
pub const FUSION_SPEC: BuiltinFusionSpec = BuiltinFusionSpec {
    name: "textscan",
    shape: ShapeRequirements::Any,
    constant_strategy: ConstantStrategy::InlineLiteral,
    elementwise: None,
    reduction: None,
    emits_nan: false,
    notes: "Not eligible for fusion; performs host-side formatted text parsing.",
};

/// Builds a `RuntimeError` carrying `message`, tagged with the `textscan`
/// builtin name and, when the descriptor defines one, its identifier.
///
/// Only `error.identifier` is consulted; the descriptor's other fields are not
/// copied into the error.
fn textscan_error_with(
    error: &'static BuiltinErrorDescriptor,
    message: impl Into<String>,
) -> RuntimeError {
    let tagged = build_runtime_error(message).with_builtin(BUILTIN_NAME);
    match error.identifier {
        Some(identifier) => tagged.with_identifier(identifier).build(),
        None => tagged.build(),
    }
}

/// Same as `textscan_error_with`, but additionally attaches `source` as the
/// underlying cause of the returned `RuntimeError`.
fn textscan_error_with_source<E>(
    error: &'static BuiltinErrorDescriptor,
    message: impl Into<String>,
    source: E,
) -> RuntimeError
where
    E: std::error::Error + Send + Sync + 'static,
{
    let tagged = build_runtime_error(message)
        .with_builtin(BUILTIN_NAME)
        .with_source(source);
    match error.identifier {
        Some(identifier) => tagged.with_identifier(identifier).build(),
        None => tagged.build(),
    }
}

/// Re-wraps an error raised while gathering arguments so that it is attributed
/// to `textscan`.
///
/// The message and identifier (if any) of `err` are copied onto the new error,
/// and `err` itself is kept as the source.
fn map_control_flow(err: RuntimeError) -> RuntimeError {
    // Both values must be copied out before `err` is moved into `with_source`.
    let carried_identifier = err.identifier().map(|id| id.to_string());
    let text = err.message().to_string();
    let wrapped = build_runtime_error(text)
        .with_builtin(BUILTIN_NAME)
        .with_source(err);
    match carried_identifier {
        Some(id) => wrapped.with_identifier(id).build(),
        None => wrapped.build(),
    }
}

// Entry point for the `textscan` builtin.
//
// `input` is either text or a file identifier, `format_spec` is the format
// string, and `rest` holds the optional repeat count followed by name-value
// options. All arguments are gathered first, then the arguments are parsed,
// the input is scanned, and the output value is built from the parsed columns.
#[runtime_builtin(
    name = "textscan",
    category = "io/import",
    summary = "Parse formatted text from a string or file identifier.",
    keywords = "textscan,formatted text,delimiter,header,format specifier,csv,log import",
    accel = "cpu",
    type_resolver(crate::builtins::io::type_resolvers::textscan_type),
    descriptor(crate::builtins::io::textscan::TEXTSCAN_DESCRIPTOR),
    builtin_path = "crate::builtins::io::textscan"
)]
async fn textscan_builtin(
    input: Value,
    format_spec: Value,
    rest: Vec<Value>,
) -> BuiltinResult<Value> {
    // Gathering errors are re-attributed to this builtin via `map_control_flow`.
    let input = gather_if_needed_async(&input)
        .await
        .map_err(map_control_flow)?;
    let format_spec = gather_if_needed_async(&format_spec)
        .await
        .map_err(map_control_flow)?;
    // The format must be a string scalar; the input is validated later because
    // it may legitimately be numeric (a file identifier).
    let format_spec = string_scalar(&format_spec, "formatSpec")?;
    let gathered_rest = gather_rest(rest).await?;
    let (repeat, options) = parse_args(&gathered_rest)?;
    let parsed = parse_input(&input, &format_spec, repeat, &options)?;
    build_output(parsed, &options)
}

/// Gathers every trailing argument in order, returning the gathered values.
///
/// Stops at the first failure; that error is passed through
/// `map_control_flow` before being returned.
async fn gather_rest(rest: Vec<Value>) -> BuiltinResult<Vec<Value>> {
    let mut gathered = Vec::with_capacity(rest.len());
    for value in rest {
        let resolved = gather_if_needed_async(&value)
            .await
            .map_err(map_control_flow)?;
        gathered.push(resolved);
    }
    Ok(gathered)
}

/// Scans the input and returns the parsed columns.
///
/// When `numeric_fid` recognises `value` as a file identifier the registered
/// file is read; otherwise `value` must be a string scalar and is scanned
/// directly as text.
fn parse_input(
    value: &Value,
    format_spec: &str,
    repeat: Option<usize>,
    options: &TextscanOptions,
) -> BuiltinResult<Vec<ColumnData>> {
    let parsed = match numeric_fid(value) {
        Some(fid) => parse_registered_file(fid, format_spec, repeat, options)?,
        None => {
            let text = string_scalar(value, "textOrFileID")?;
            parse_textscan(&text, format_spec, repeat, options)?
        }
    };
    Ok(parsed.columns)
}

/// Scans the remainder of the file registered under `fid` and then positions
/// the file just after the text that the scan consumed.
///
/// Steps: validate the identifier, check that it is registered and readable,
/// lock its shared handle, remember the current position, read to the end,
/// decode with the registered encoding, scan, and finally seek to
/// `start + consumed bytes`.
///
/// # Errors
///
/// Returns a `TEXTSCAN_ERROR_FILE`-tagged error when the identifier is
/// rejected, unknown, not readable, closed, the lock is poisoned, or any
/// read/seek/decode step fails. Errors from `parse_textscan` are propagated
/// unchanged.
fn parse_registered_file(
    fid: i32,
    format_spec: &str,
    repeat: Option<usize>,
    options: &TextscanOptions,
) -> BuiltinResult<ParsedTextscan> {
    validate_fid(fid)?;
    let info = registry::info_for(fid).ok_or_else(|| {
        textscan_error_with(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: invalid file identifier {fid}"),
        )
    })?;
    if !permission_allows_read(&info.permission) {
        return Err(textscan_error_with(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: file identifier {fid} is not open for reading"),
        ));
    }
    let handle = registry::shared_handle(fid).ok_or_else(|| {
        textscan_error_with(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: invalid file identifier {fid}"),
        )
    })?;
    // The guard is held until the function returns, covering read and seek.
    let mut guard = handle
        .lock()
        .map_err(|_| textscan_error_with(&TEXTSCAN_ERROR_FILE, "textscan: file handle poisoned"))?;
    let file = guard.as_mut().ok_or_else(|| {
        textscan_error_with(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: file identifier {fid} is closed"),
        )
    })?;
    // Scanning starts at the current position, not at the start of the file.
    let start = file.stream_position().map_err(|err| {
        textscan_error_with_source(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: unable to seek file identifier {fid} ({err})"),
            err,
        )
    })?;
    let mut bytes = Vec::new();
    file.read_to_end(&mut bytes).map_err(|err| {
        textscan_error_with_source(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: unable to read from file identifier {fid} ({err})"),
            err,
        )
    })?;
    // A blank registered encoding falls back to UTF-8.
    let encoding = if info.encoding.trim().is_empty() {
        "UTF-8"
    } else {
        info.encoding.as_str()
    };
    // NOTE(review): if decoding, scanning, or the offset lookup below fails,
    // the function returns before the final seek, so the file position stays
    // wherever `read_to_end` left it — confirm this is the intended behaviour.
    let decoded = DecodedFileText::decode(&bytes, encoding)?;
    let parsed = parse_textscan(&decoded.text, format_spec, repeat, options)?;
    // Translate the scanner's offset in the decoded text back to a byte count
    // in the original file contents.
    let consumed_bytes = decoded.byte_offset_for_text_pos(parsed.consumed_text_pos)?;
    let target = start.saturating_add(consumed_bytes as u64);
    file.seek(SeekFrom::Start(target)).map_err(|err| {
        textscan_error_with_source(
            &TEXTSCAN_ERROR_FILE,
            format!("textscan: unable to restore file position for identifier {fid} ({err})"),
            err,
        )
    })?;
    Ok(parsed)
}

/// Rejects file identifiers that `textscan` cannot read from.
///
/// # Errors
///
/// Negative identifiers and the identifiers `0`, `1`, and `2` each produce a
/// `TEXTSCAN_ERROR_FILE`-tagged error; every other value is accepted.
fn validate_fid(fid: i32) -> BuiltinResult<()> {
    let complaint = match fid {
        i32::MIN..=-1 => "textscan: file identifier must be non-negative",
        0..=2 => "textscan: standard input/output identifiers are not supported yet",
        _ => return Ok(()),
    };
    Err(textscan_error_with(&TEXTSCAN_ERROR_FILE, complaint))
}

/// Reports whether a permission string allows reading: it either begins with
/// `r`/`R` or contains a `+` anywhere.
fn permission_allows_read(permission: &str) -> bool {
    let opens_for_read = matches!(permission.chars().next(), Some('r' | 'R'));
    opens_for_read || permission.contains('+')
}

/// Decoded file contents together with a per-character map back to the
/// original byte offsets.
struct DecodedFileText {
    /// The decoded text handed to the scanner.
    text: String,
    /// For each decoded character, its starting byte offset within `text`.
    text_offsets: Vec<usize>,
    /// For each decoded character, the offset in the source bytes it was
    /// decoded from (parallel to `text_offsets`).
    byte_offsets: Vec<usize>,
    /// Total number of source bytes that were decoded.
    byte_len: usize,
}

impl DecodedFileText {
    /// Decodes `bytes` using `encoding` and records, for every decoded
    /// character, where it starts in the decoded text and in the source bytes.
    ///
    /// Shift_JIS input is delegated to `decode_shift_jis_with_offsets`. For
    /// other encodings each character advances the source offset by the fixed
    /// width reported by `byte_preserving_encoding_width`, or by the
    /// character's UTF-8 length when no fixed width applies.
    ///
    /// # Errors
    ///
    /// Fails when the bytes cannot be decoded or when the encoding is not one
    /// that `byte_preserving_encoding_width` accepts.
    fn decode(bytes: &[u8], encoding: &str) -> BuiltinResult<Self> {
        if is_shift_jis_encoding(encoding) {
            return decode_shift_jis_with_offsets(bytes, encoding);
        }
        let chars = decode_bytes(bytes, encoding, BUILTIN_NAME)
            .map_err(|err| textscan_error_with(&TEXTSCAN_ERROR_FILE, err.message()))?;
        let fixed_width = byte_preserving_encoding_width(encoding)?;
        let expected = chars.len();
        let mut decoded = Self {
            text: String::new(),
            text_offsets: Vec::with_capacity(expected),
            byte_offsets: Vec::with_capacity(expected),
            byte_len: bytes.len(),
        };
        let mut source_offset = 0usize;
        for ch in chars {
            decoded.text_offsets.push(decoded.text.len());
            decoded.byte_offsets.push(source_offset);
            decoded.text.push(ch);
            source_offset += match fixed_width {
                Some(width) => width,
                None => ch.len_utf8(),
            };
        }
        Ok(decoded)
    }

    /// Maps a byte position in the decoded text to the matching offset in the
    /// source bytes.
    ///
    /// A position at (or beyond) the end of the text maps to the total source
    /// length.
    ///
    /// # Errors
    ///
    /// Fails when `text_pos` lies inside the text but does not coincide with
    /// the start of a decoded character.
    fn byte_offset_for_text_pos(&self, text_pos: usize) -> BuiltinResult<usize> {
        if text_pos == self.text.len() {
            return Ok(self.byte_len);
        }
        let insertion_point = match self.text_offsets.binary_search(&text_pos) {
            Ok(slot) => return Ok(self.byte_offsets[slot]),
            Err(slot) => slot,
        };
        if insertion_point == self.text_offsets.len() {
            Ok(self.byte_len)
        } else {
            Err(textscan_error_with(
                &TEXTSCAN_ERROR_FILE,
                "textscan: parsed position did not fall on a decoded character boundary",
            ))
        }
    }
}

/// Decodes Shift_JIS bytes one code unit (one or two bytes) at a time so that
/// every decoded character can be mapped to the byte offset of its unit.
///
/// # Errors
///
/// Fails when `shift_jis_unit_width` rejects a byte sequence or when decoding
/// a unit fails.
fn decode_shift_jis_with_offsets(bytes: &[u8], encoding: &str) -> BuiltinResult<DecodedFileText> {
    let mut result = DecodedFileText {
        text: String::new(),
        text_offsets: Vec::new(),
        byte_offsets: Vec::new(),
        byte_len: bytes.len(),
    };
    let mut cursor = 0usize;
    while cursor < bytes.len() {
        let unit_end = cursor + shift_jis_unit_width(bytes, cursor)?;
        let unit = decode_bytes(&bytes[cursor..unit_end], encoding, BUILTIN_NAME)
            .map_err(|err| textscan_error_with(&TEXTSCAN_ERROR_FILE, err.message()))?;
        // Every character produced by this unit maps back to the unit's start.
        for ch in unit {
            result.text_offsets.push(result.text.len());
            result.byte_offsets.push(cursor);
            result.text.push(ch);
        }
        cursor = unit_end;
    }
    Ok(result)
}

/// Returns the byte width (1 or 2) of the Shift_JIS unit starting at `offset`.
///
/// Bytes `0x00..=0x80` and `0xA1..=0xDF` are single-byte units. Lead bytes
/// `0x81..=0x9F` and `0xE0..=0xFC` need a trail byte in `0x40..=0x7E` or
/// `0x80..=0xFC`.
///
/// # Errors
///
/// Fails for a lead byte at the end of the input, an invalid trail byte, or
/// any other first byte.
///
/// # Panics
///
/// Panics if `offset` is not a valid index into `bytes`.
fn shift_jis_unit_width(bytes: &[u8], offset: usize) -> BuiltinResult<usize> {
    let invalid_sequence = || {
        textscan_error_with(
            &TEXTSCAN_ERROR_FILE,
            "textscan: invalid Shift_JIS byte sequence",
        )
    };
    match bytes[offset] {
        0x00..=0x80 | 0xA1..=0xDF => Ok(1),
        0x81..=0x9F | 0xE0..=0xFC => match bytes.get(offset + 1).copied() {
            Some(0x40..=0x7E | 0x80..=0xFC) => Ok(2),
            Some(_) => Err(invalid_sequence()),
            None => Err(textscan_error_with(
                &TEXTSCAN_ERROR_FILE,
                "textscan: incomplete Shift_JIS character at end of file",
            )),
        },
        _ => Err(invalid_sequence()),
    }
}

/// Reports whether the trimmed `encoding` label resolves to Shift_JIS according
/// to `encoding_rs`.
fn is_shift_jis_encoding(encoding: &str) -> bool {
    let label = encoding.trim().as_bytes();
    Encoding::for_label(label).is_some_and(|resolved| resolved == SHIFT_JIS)
}

/// Returns the fixed number of source bytes per decoded character for
/// `encoding`, or `None` when the width varies per character.
///
/// An empty label, UTF-8, and Shift_JIS yield `Ok(None)`; the listed
/// single-byte labels yield `Ok(Some(1))`. Labels are compared after trimming
/// and without regard to ASCII case.
///
/// # Errors
///
/// Any other encoding is rejected, because decoded positions could not be
/// mapped back to file offsets.
fn byte_preserving_encoding_width(encoding: &str) -> BuiltinResult<Option<usize>> {
    const UTF8_LABELS: [&str; 2] = ["utf-8", "utf8"];
    const SINGLE_BYTE_LABELS: [&str; 10] = [
        "binary",
        "latin1",
        "latin-1",
        "iso-8859-1",
        "windows-1252",
        "cp1252",
        "us-ascii",
        "ascii",
        "us_ascii",
        "usascii",
    ];

    let label = encoding.trim();
    let matches_any = |known: &[&str]| known.iter().any(|name| label.eq_ignore_ascii_case(name));

    if label.is_empty() || matches_any(&UTF8_LABELS) || is_shift_jis_encoding(label) {
        return Ok(None);
    }
    if matches_any(&SINGLE_BYTE_LABELS) {
        return Ok(Some(1));
    }
    Err(textscan_error_with(
        &TEXTSCAN_ERROR_FILE,
        format!(
            "textscan: file-position preserving reads do not yet support encoding '{encoding}'"
        ),
    ))
}

/// Name-value options accepted by `textscan`, as populated by `apply_option`.
#[derive(Debug, Clone)]
struct TextscanOptions {
    /// Field delimiters from the `Delimiter` option.
    delimiters: Vec<String>,
    /// Characters treated as whitespace, from the `Whitespace` option.
    whitespace: String,
    /// Value of the `MultipleDelimsAsOne` option.
    multiple_delims_as_one: bool,
    /// Number of leading lines to skip, from the `HeaderLines` option.
    header_lines: usize,
    /// Strings from the `TreatAsEmpty` option.
    treat_as_empty: Vec<String>,
    /// Comment syntax from the `CommentStyle` option.
    comment_style: CommentStyle,
    /// Value of the `CollectOutput` option.
    collect_output: bool,
    /// Value of the `ReturnOnError` option; when set, a row that fails to
    /// parse ends the scan instead of raising an error.
    return_on_error: bool,
}

/// Defaults used when no name-value options are supplied.
impl Default for TextscanOptions {
    fn default() -> Self {
        Self {
            delimiters: Vec::new(),
            // Space, backspace (U+0008), and horizontal tab.
            whitespace: " \u{0008}\t".to_string(),
            multiple_delims_as_one: false,
            header_lines: 0,
            treat_as_empty: Vec::new(),
            comment_style: CommentStyle::None,
            collect_output: false,
            // By default a failing row stops the scan rather than erroring.
            return_on_error: true,
        }
    }
}

/// Comment syntax configured through the `CommentStyle` option.
#[derive(Debug, Clone, PartialEq, Eq)]
enum CommentStyle {
    /// No comment handling.
    None,
    /// One or more line-comment markers.
    Line(Vec<String>),
    /// A block comment delimited by `start` and `end` markers.
    Block { start: String, end: String },
}

/// Splits the trailing arguments into an optional repeat count and the parsed
/// name-value options.
///
/// A leading numeric scalar is taken as the repeat count; everything after it
/// must form name-value pairs, which are applied in order on top of the
/// default options.
///
/// # Errors
///
/// Fails when the repeat count is not a non-negative integer, when the
/// remaining arguments do not pair up, when an option name is not a string
/// scalar, or when `apply_option` rejects a pair.
fn parse_args(args: &[Value]) -> BuiltinResult<(Option<usize>, TextscanOptions)> {
    let (repeat, options_start) = match args.first() {
        Some(value) if is_numeric_scalar(value) => {
            (Some(nonnegative_usize(value, "repeat count")?), 1usize)
        }
        _ => (None, 0usize),
    };
    let pairs = &args[options_start..];
    if !pairs.len().is_multiple_of(2) {
        return Err(textscan_error_with(
            &TEXTSCAN_ERROR_ARGUMENT,
            "textscan: options must be provided as name-value pairs",
        ));
    }
    let mut options = TextscanOptions::default();
    // The length is even here, so `chunks_exact` visits every argument.
    for pair in pairs.chunks_exact(2) {
        let name = string_scalar(&pair[0], "option name")?;
        apply_option(&mut options, &name, &pair[1])?;
    }
    Ok((repeat, options))
}

/// Applies a single name-value option to `options`.
///
/// The name is matched after `normalize_name` (case, spaces, and underscores
/// are ignored).
///
/// # Errors
///
/// Fails when the value has the wrong type for the option, when the option is
/// recognised but not implemented, or when the name is unknown.
fn apply_option(options: &mut TextscanOptions, name: &str, value: &Value) -> BuiltinResult<()> {
    let key = normalize_name(name);
    match key.as_str() {
        "delimiter" => {
            options.delimiters = delimiter_list(value)?;
        }
        "multipledelimsasone" => {
            options.multiple_delims_as_one = bool_like(value, name)?;
        }
        "headerlines" => {
            options.header_lines = nonnegative_usize(value, name)?;
        }
        "treatasempty" => {
            options.treat_as_empty = string_list(value, name)?;
        }
        "commentstyle" => {
            options.comment_style = parse_comment_style(value)?;
        }
        "collectoutput" => {
            options.collect_output = bool_like(value, name)?;
        }
        "returnonerror" => {
            options.return_on_error = bool_like(value, name)?;
        }
        "whitespace" => {
            options.whitespace = string_scalar(value, name)?;
        }
        // Recognised options that are not supported yet.
        "emptyvalue" | "endofline" | "bufsize" | "expchars" | "texttype" | "datelocale" => {
            let message = format!("textscan: option '{name}' is not implemented yet");
            return Err(textscan_error_with(&TEXTSCAN_ERROR_ARGUMENT, message));
        }
        // The unknown-option message reports the normalized spelling.
        unknown => {
            let message = format!("textscan: unsupported option '{unknown}'");
            return Err(textscan_error_with(&TEXTSCAN_ERROR_ARGUMENT, message));
        }
    }
    Ok(())
}

/// Canonicalises an option name: underscores and spaces are removed and the
/// remaining characters are lowercased.
fn normalize_name(name: &str) -> String {
    let mut normalized = String::with_capacity(name.len());
    for ch in name.chars() {
        if ch == '_' || ch == ' ' {
            continue;
        }
        // `to_lowercase` may expand one character into several.
        normalized.extend(ch.to_lowercase());
    }
    normalized
}

/// A single `%` conversion parsed from the format specification.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FormatItem {
    /// What the conversion reads.
    kind: FormatKind,
    /// `true` when the conversion was written with `*` (e.g. `%*s`); such
    /// conversions do not get an output column.
    skip: bool,
    /// Explicit field width, if digits preceded the conversion character.
    width: Option<usize>,
}

/// The kind of value a conversion reads, as selected by its conversion character.
#[derive(Debug, Clone, PartialEq, Eq)]
enum FormatKind {
    /// `%f`, `%e`, `%E`, `%g`, `%G`, `%n`.
    Float,
    /// `%d`, `%i`.
    SignedInt,
    /// `%u`.
    UnsignedInt,
    /// `%s`.
    String,
    /// `%q`.
    QuotedString,
    /// `%c`.
    Char,
    /// `%[...]`, or `%[^...]` when `negated` is set.
    CharSet { chars: HashSet<char>, negated: bool },
}

impl FormatKind {
    /// Tells whether values read by this conversion are stored in a numeric or
    /// a text column.
    fn output_kind(&self) -> OutputKind {
        match self {
            Self::String | Self::QuotedString | Self::Char | Self::CharSet { .. } => {
                OutputKind::Text
            }
            Self::Float | Self::SignedInt | Self::UnsignedInt => OutputKind::Numeric,
        }
    }
}

/// Storage category of an output column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OutputKind {
    /// Values are stored as `f64`.
    Numeric,
    /// Values are stored as `String`.
    Text,
}

/// One element of a parsed format specification.
#[derive(Debug, Clone, PartialEq, Eq)]
enum FormatElement {
    /// A `%` conversion.
    Conversion(FormatItem),
    /// Literal text that must appear in the input (`%%` yields a literal `%`).
    Literal(String),
    /// A run of one or more whitespace characters in the format.
    Whitespace,
}

/// Parses a `textscan` format specification into an ordered list of elements.
///
/// * Runs of whitespace collapse into a single [`FormatElement::Whitespace`].
/// * Any other text outside a conversion becomes a [`FormatElement::Literal`];
///   `%%` produces the literal `%`.
/// * `%[*][width]<conv>` becomes a [`FormatElement::Conversion`], where
///   `<conv>` is one of `f e E g G n d i u s q c` or a character set
///   `[...]` / `[^...]`.
///
/// Inside a character set, a `]` that appears first (directly after `[` or
/// `[^`) is a member of the set rather than its terminator, so `%[]ab]`
/// matches `]`, `a`, and `b`. As a consequence `%[]` and `%[^]` on their own
/// are reported as unterminated.
///
/// # Errors
///
/// Returns a `TEXTSCAN_ERROR_FORMAT`-tagged error when a field width cannot be
/// parsed, a conversion is incomplete or unsupported, a character set is not
/// terminated, or the specification contains no conversion at all.
fn parse_format_spec(format: &str) -> BuiltinResult<Vec<FormatElement>> {
    let mut elements = Vec::new();
    let mut conversion_count = 0usize;
    let chars: Vec<char> = format.chars().collect();
    let mut idx = 0usize;
    while idx < chars.len() {
        if chars[idx] != '%' {
            if chars[idx].is_whitespace() {
                while idx < chars.len() && chars[idx].is_whitespace() {
                    idx += 1;
                }
                // Adjacent whitespace runs never produce two markers in a row.
                if !matches!(elements.last(), Some(FormatElement::Whitespace)) {
                    elements.push(FormatElement::Whitespace);
                }
                continue;
            }
            let start = idx;
            while idx < chars.len() && chars[idx] != '%' && !chars[idx].is_whitespace() {
                idx += 1;
            }
            elements.push(FormatElement::Literal(chars[start..idx].iter().collect()));
            continue;
        }
        idx += 1;
        if idx < chars.len() && chars[idx] == '%' {
            elements.push(FormatElement::Literal("%".to_string()));
            idx += 1;
            continue;
        }
        let mut skip = false;
        if idx < chars.len() && chars[idx] == '*' {
            skip = true;
            idx += 1;
        }
        let width_start = idx;
        while idx < chars.len() && chars[idx].is_ascii_digit() {
            idx += 1;
        }
        let width = if idx > width_start {
            Some(
                chars[width_start..idx]
                    .iter()
                    .collect::<String>()
                    .parse::<usize>()
                    .map_err(|_| {
                        textscan_error_with(&TEXTSCAN_ERROR_FORMAT, "textscan: invalid field width")
                    })?,
            )
        } else {
            None
        };
        if idx >= chars.len() {
            return Err(textscan_error_with(
                &TEXTSCAN_ERROR_FORMAT,
                "textscan: incomplete conversion specifier",
            ));
        }
        let kind = match chars[idx] {
            'f' | 'e' | 'E' | 'g' | 'G' | 'n' => {
                idx += 1;
                FormatKind::Float
            }
            'd' | 'i' => {
                idx += 1;
                FormatKind::SignedInt
            }
            'u' => {
                idx += 1;
                FormatKind::UnsignedInt
            }
            's' => {
                idx += 1;
                FormatKind::String
            }
            'q' => {
                idx += 1;
                FormatKind::QuotedString
            }
            'c' => {
                idx += 1;
                FormatKind::Char
            }
            '[' => {
                idx += 1;
                let mut negated = false;
                if idx < chars.len() && chars[idx] == '^' {
                    negated = true;
                    idx += 1;
                }
                let mut set = HashSet::new();
                // A `]` in first position belongs to the set; only a later
                // `]` closes it.
                if idx < chars.len() && chars[idx] == ']' {
                    set.insert(']');
                    idx += 1;
                }
                while idx < chars.len() && chars[idx] != ']' {
                    set.insert(chars[idx]);
                    idx += 1;
                }
                if idx >= chars.len() {
                    return Err(textscan_error_with(
                        &TEXTSCAN_ERROR_FORMAT,
                        "textscan: unterminated character set conversion",
                    ));
                }
                // Step over the closing `]`.
                idx += 1;
                FormatKind::CharSet {
                    chars: set,
                    negated,
                }
            }
            other => {
                return Err(textscan_error_with(
                    &TEXTSCAN_ERROR_FORMAT,
                    format!("textscan: unsupported conversion '%{other}'"),
                ));
            }
        };
        elements.push(FormatElement::Conversion(FormatItem { kind, skip, width }));
        conversion_count += 1;
    }
    if conversion_count == 0 {
        return Err(textscan_error_with(
            &TEXTSCAN_ERROR_FORMAT,
            "textscan: formatSpec must contain at least one conversion",
        ));
    }
    Ok(elements)
}

/// Values accumulated for one output column, one entry per parsed record.
#[derive(Debug, Clone)]
enum ColumnData {
    /// Column fed by a numeric conversion.
    Numeric(Vec<f64>),
    /// Column fed by a text conversion.
    Text(Vec<String>),
}

impl ColumnData {
    /// Creates an empty column of the requested kind.
    fn new(kind: OutputKind) -> Self {
        match kind {
            OutputKind::Text => Self::Text(Vec::new()),
            OutputKind::Numeric => Self::Numeric(Vec::new()),
        }
    }

    /// Returns the kind of values this column stores.
    fn kind(&self) -> OutputKind {
        match self {
            Self::Text(_) => OutputKind::Text,
            Self::Numeric(_) => OutputKind::Numeric,
        }
    }

    /// Returns the number of stored values.
    fn len(&self) -> usize {
        match self {
            Self::Text(entries) => entries.len(),
            Self::Numeric(entries) => entries.len(),
        }
    }

    /// Drops every value past the first `len`; used to roll back a partial row.
    fn truncate(&mut self, len: usize) {
        match self {
            Self::Text(entries) => entries.truncate(len),
            Self::Numeric(entries) => entries.truncate(len),
        }
    }

    /// Appends a number.
    ///
    /// # Panics
    ///
    /// Panics when called on a text column.
    fn push_numeric(&mut self, value: f64) {
        match self {
            Self::Numeric(entries) => entries.push(value),
            Self::Text(_) => unreachable!("numeric pushed into text column"),
        }
    }

    /// Appends a string.
    ///
    /// # Panics
    ///
    /// Panics when called on a numeric column.
    fn push_text(&mut self, value: String) {
        match self {
            Self::Text(entries) => entries.push(value),
            Self::Numeric(_) => unreachable!("text pushed into numeric column"),
        }
    }
}

/// Scans `text` according to `format`, producing one column per non-skipped
/// conversion together with the text position reached by the scanner.
///
/// The format is applied repeatedly, one record per pass, until the text is
/// exhausted, `repeat` records have been read, a record fails, or a pass
/// consumes nothing.
///
/// # Errors
///
/// Fails when the format specification is invalid. Literal-matching and
/// conversion errors are only returned when `options.return_on_error` is
/// `false`; otherwise the failing record is discarded and scanning stops.
fn parse_textscan(
    text: &str,
    format: &str,
    repeat: Option<usize>,
    options: &TextscanOptions,
) -> BuiltinResult<ParsedTextscan> {
    let elements = parse_format_spec(format)?;
    // Skipped conversions (`%*...`) do not get an output column.
    let output_kinds: Vec<OutputKind> = elements
        .iter()
        .filter_map(|element| match element {
            FormatElement::Conversion(item) if !item.skip => Some(item.kind.output_kind()),
            _ => None,
        })
        .collect();
    let mut columns: Vec<ColumnData> = output_kinds.into_iter().map(ColumnData::new).collect();
    let mut scanner = TextScanner::new(text, options);
    scanner.skip_header_lines();
    let mut records = 0usize;
    // `repeat == None` means no limit on the number of records.
    while !scanner.is_eof() && repeat.map(|limit| records < limit).unwrap_or(true) {
        scanner.skip_separators();
        if scanner.is_eof() {
            break;
        }
        // Length of the columns before this record, used to roll back a
        // partially parsed record.
        let row_len = columns.first().map(ColumnData::len).unwrap_or(0);
        let mut output_idx = 0usize;
        let row_start = scanner.pos;
        let mut failed = false;
        for idx in 0..elements.len() {
            match &elements[idx] {
                FormatElement::Whitespace => scanner.skip_format_whitespace(),
                FormatElement::Literal(literal) => {
                    if let Err(err) = scanner.consume_literal(literal) {
                        if options.return_on_error {
                            failed = true;
                            break;
                        }
                        return Err(err);
                    }
                }
                FormatElement::Conversion(item) => {
                    // Only a literal directly following the conversion is
                    // passed along to the conversion parser.
                    let next_literal = next_literal(&elements[idx + 1..]);
                    let parsed = match scanner.parse_conversion(item, next_literal) {
                        Ok(parsed) => parsed,
                        Err(_) if options.return_on_error => {
                            failed = true;
                            break;
                        }
                        Err(err) => return Err(err),
                    };
                    // `None` means the conversion produced no value; nothing
                    // is stored and `output_idx` does not advance.
                    let Some(parsed) = parsed else {
                        continue;
                    };
                    match parsed {
                        ParsedValue::Number(value) => {
                            if !item.skip {
                                columns[output_idx].push_numeric(value);
                                output_idx += 1;
                            }
                        }
                        ParsedValue::Text(value) => {
                            if !item.skip {
                                columns[output_idx].push_text(value);
                                output_idx += 1;
                            }
                        }
                    }
                }
            }
        }
        // A failed or incomplete record is removed from every column so all
        // columns keep the same length, and scanning ends. The scanner
        // position is left where the record stopped.
        if failed || output_idx < columns.len() {
            for column in &mut columns {
                column.truncate(row_len);
            }
            break;
        }
        // Guard against looping forever when a pass consumed no input.
        if scanner.pos == row_start {
            break;
        }
        records += 1;
        scanner.skip_separators();
    }
    Ok(ParsedTextscan {
        columns,
        consumed_text_pos: scanner.pos,
    })
}

/// Result of a scan: the parsed columns and how far the scanner advanced.
#[derive(Debug, Clone)]
struct ParsedTextscan {
    /// One column per non-skipped conversion, in format order.
    columns: Vec<ColumnData>,
    /// Byte offset in the scanned text where the scanner stopped.
    consumed_text_pos: usize,
}

/// Returns the literal text when the first of `elements` is a literal.
///
/// Yields `None` for an empty slice or when the first element is whitespace or
/// a conversion; later elements are never inspected.
fn next_literal(elements: &[FormatElement]) -> Option<&str> {
    match elements.first()? {
        FormatElement::Literal(text) => Some(text.as_str()),
        FormatElement::Conversion(_) | FormatElement::Whitespace => None,
    }
}

/// Cursor over the input text used while applying a `textscan` format.
struct TextScanner<'a> {
    /// Complete text being scanned.
    text: &'a str,
    /// Current byte offset into `text`.
    pos: usize,
    /// Parsing options supplied by the caller.
    options: &'a TextscanOptions,
    /// Characters taken from `options.whitespace`; treated as separators in
    /// addition to Unicode whitespace.
    whitespace: HashSet<char>,
    /// Delimiters from `options.delimiters` plus the line endings `"\r\n"`,
    /// `"\n"` and `"\r"`, sorted longest first.
    delimiters: Vec<String>,
}

impl<'a> TextScanner<'a> {
    fn new(text: &'a str, options: &'a TextscanOptions) -> Self {
        let mut delimiters = options.delimiters.clone();
        delimiters.push("\r\n".to_string());
        delimiters.push("\n".to_string());
        delimiters.push("\r".to_string());
        delimiters.sort_by_key(|delimiter| std::cmp::Reverse(delimiter.len()));
        Self {
            text,
            pos: 0,
            options,
            whitespace: options.whitespace.chars().collect(),
            delimiters,
        }
    }

    /// Reports whether the cursor has reached (or passed) the end of the text.
    fn is_eof(&self) -> bool {
        self.text.len() <= self.pos
    }

    /// Returns the character at the cursor, or `None` when no text remains.
    fn current_char(&self) -> Option<char> {
        let remaining = &self.text[self.pos..];
        remaining.chars().next()
    }

    /// Advances the cursor past `options.header_lines` lines of input.
    ///
    /// A line ends at `"\n"`, `"\r\n"`, or a lone `"\r"`, the same line endings
    /// the scanner registers as delimiters. Stops early if the text runs out.
    fn skip_header_lines(&mut self) {
        for _ in 0..self.options.header_lines {
            while let Some(ch) = self.current_char() {
                self.pos += ch.len_utf8();
                match ch {
                    '\n' => break,
                    '\r' => {
                        // "\r\n" is a single terminator; a lone '\r' also ends
                        // the line so CR-only input is not swallowed whole.
                        if self.current_char() == Some('\n') {
                            self.pos += '\n'.len_utf8();
                        }
                        break;
                    }
                    _ => {}
                }
            }
        }
    }

    /// Moves the cursor past any run of Unicode whitespace at the current position.
    fn skip_format_whitespace(&mut self) {
        let remaining = &self.text[self.pos..];
        // `trim_start` strips exactly the characters for which `char::is_whitespace` holds.
        let skipped = remaining.len() - remaining.trim_start().len();
        self.pos += skipped;
    }

    /// Consumes everything that separates fields at the cursor.
    ///
    /// Repeatedly skips, in this priority order, a comment, a delimiter, or a
    /// single whitespace character (Unicode whitespace or one of the configured
    /// whitespace characters) until none of them applies.
    fn skip_separators(&mut self) {
        loop {
            if self.skip_comment() {
                continue;
            }
            if let Some(delimiter_len) = self.match_delimiter().map(str::len) {
                self.pos += delimiter_len;
                continue;
            }
            match self.current_char() {
                Some(ch) if ch.is_whitespace() || self.whitespace.contains(&ch) => {
                    self.pos += ch.len_utf8();
                }
                _ => break,
            }
        }
    }

    /// Consumes `literal` at the cursor.
    ///
    /// # Errors
    /// Returns a parse error, leaving the cursor untouched, when the remaining
    /// text does not start with `literal`.
    fn consume_literal(&mut self, literal: &str) -> BuiltinResult<()> {
        let remaining = &self.text[self.pos..];
        if !remaining.starts_with(literal) {
            return Err(textscan_error_with(
                &TEXTSCAN_ERROR_PARSE,
                format!("textscan: expected literal '{literal}'"),
            ));
        }
        self.pos += literal.len();
        Ok(())
    }

    /// Parses one conversion from the input at the cursor.
    ///
    /// Separators are skipped first for every conversion except the character
    /// and character-set kinds, which read the text exactly as it stands.
    /// `next_literal` is the literal that follows this conversion in the
    /// format (if any); field reads stop when they reach it.
    ///
    /// Returns `Ok(None)` when the item is marked as skipped (the input is
    /// still consumed), otherwise `Ok(Some(value))`.
    ///
    /// # Errors
    /// Propagates any parse error raised by the underlying field reader.
    fn parse_conversion(
        &mut self,
        item: &FormatItem,
        next_literal: Option<&str>,
    ) -> BuiltinResult<Option<ParsedValue>> {
        let reads_raw_characters =
            matches!(item.kind, FormatKind::Char | FormatKind::CharSet { .. });
        if !reads_raw_characters {
            self.skip_separators();
        }
        let width = item.width;
        let value = match &item.kind {
            FormatKind::Float => {
                let number = self.parse_numeric_field(width, next_literal, parse_float)?;
                ParsedValue::Number(number)
            }
            FormatKind::SignedInt => {
                let number = self.parse_numeric_field(width, next_literal, |field| {
                    parse_signed_int(field).map(|value| value as f64)
                })?;
                ParsedValue::Number(number)
            }
            FormatKind::UnsignedInt => {
                let number = self.parse_numeric_field(width, next_literal, |field| {
                    parse_unsigned_int(field).map(|value| value as f64)
                })?;
                ParsedValue::Number(number)
            }
            FormatKind::String => {
                let text = self.read_field(width, next_literal, true)?;
                ParsedValue::Text(text)
            }
            FormatKind::QuotedString => {
                let text = self.read_quoted_or_field(width, next_literal)?;
                ParsedValue::Text(text)
            }
            FormatKind::Char => {
                let text = self.read_chars(width.unwrap_or(1))?;
                ParsedValue::Text(text)
            }
            FormatKind::CharSet { chars, negated } => {
                let text = self.read_charset(chars, *negated, width)?;
                ParsedValue::Text(text)
            }
        };
        Ok(if item.skip { None } else { Some(value) })
    }

    /// Reads one field and converts it to a number with `parse`.
    ///
    /// A field equal to one of the `treat_as_empty` option entries yields NaN
    /// without invoking `parse`.
    ///
    /// # Errors
    /// Propagates errors from reading the field and from `parse`.
    fn parse_numeric_field(
        &mut self,
        width: Option<usize>,
        next_literal: Option<&str>,
        parse: impl FnOnce(&str) -> BuiltinResult<f64>,
    ) -> BuiltinResult<f64> {
        let field = self.read_field(width, next_literal, false)?;
        let treated_as_empty = self
            .options
            .treat_as_empty
            .iter()
            .any(|marker| marker == &field);
        if treated_as_empty {
            Ok(f64::NAN)
        } else {
            parse(&field)
        }
    }

    /// Reads one delimited field starting at the cursor and returns it trimmed.
    ///
    /// Reading stops at the first of: end of text, `width` characters consumed,
    /// the upcoming format literal `next_literal`, a separator, or a comment.
    /// When `allow_treat_empty` is set and the trimmed field equals one of the
    /// `treat_as_empty` option entries, an empty string is returned instead.
    ///
    /// # Errors
    /// Returns a parse error when the trimmed field is empty.
    fn read_field(
        &mut self,
        width: Option<usize>,
        next_literal: Option<&str>,
        allow_treat_empty: bool,
    ) -> BuiltinResult<String> {
        let field_start = self.pos;
        let mut taken = 0usize;
        while let Some(ch) = self.current_char() {
            let width_reached = matches!(width, Some(limit) if taken >= limit);
            if width_reached {
                break;
            }
            let at_next_literal = matches!(
                next_literal,
                Some(literal) if self.text[self.pos..].starts_with(literal)
            );
            if at_next_literal {
                break;
            }
            if self.is_at_separator() || self.is_at_comment() {
                break;
            }
            self.pos += ch.len_utf8();
            taken += 1;
        }
        let field = self.text[field_start..self.pos].trim().to_string();
        let treated_as_empty = allow_treat_empty
            && self
                .options
                .treat_as_empty
                .iter()
                .any(|marker| marker == &field);
        if treated_as_empty {
            return Ok(String::new());
        }
        if field.is_empty() {
            Err(textscan_error_with(
                &TEXTSCAN_ERROR_PARSE,
                "textscan: empty field",
            ))
        } else {
            Ok(field)
        }
    }

    /// Reads a double-quoted field, or an ordinary field when the cursor is not
    /// on a `"`.
    ///
    /// Inside quotes, a doubled quote (`""`) stands for one literal `"` and a
    /// single `"` ends the field. When `width` is given, reading stops once
    /// that many characters have been collected; the character at which the
    /// limit is hit is left unconsumed so that it is not lost. A closing quote
    /// found exactly at the limit is still consumed.
    ///
    /// # Errors
    /// Returns a parse error when the text ends before the closing quote, and
    /// propagates errors from the unquoted path.
    fn read_quoted_or_field(
        &mut self,
        width: Option<usize>,
        next_literal: Option<&str>,
    ) -> BuiltinResult<String> {
        if self.current_char() != Some('"') {
            return self.read_field(width, next_literal, true);
        }
        self.pos += '"'.len_utf8();
        let mut out = String::new();
        // Number of characters in `out`; tracked to avoid recounting per character.
        let mut taken = 0usize;
        while let Some(ch) = self.current_char() {
            if ch == '"' {
                self.pos += '"'.len_utf8();
                if self.current_char() == Some('"') {
                    // Escaped quote: keep one `"` and continue.
                    self.pos += '"'.len_utf8();
                    out.push('"');
                    taken += 1;
                    continue;
                }
                return Ok(out);
            }
            // Check the width before consuming so the character stays available
            // to whatever is parsed next.
            if matches!(width, Some(limit) if taken >= limit) {
                return Ok(out);
            }
            self.pos += ch.len_utf8();
            out.push(ch);
            taken += 1;
        }
        Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            "textscan: unterminated quoted field",
        ))
    }

    /// Reads exactly `count` characters, whatever they are.
    ///
    /// # Errors
    /// Returns a parse error when the text ends first; characters read before
    /// the failure remain consumed.
    fn read_chars(&mut self, count: usize) -> BuiltinResult<String> {
        let mut collected = String::new();
        let mut remaining = count;
        while remaining > 0 {
            match self.current_char() {
                Some(ch) => {
                    self.pos += ch.len_utf8();
                    collected.push(ch);
                }
                None => {
                    return Err(textscan_error_with(
                        &TEXTSCAN_ERROR_PARSE,
                        "textscan: not enough characters for %c conversion",
                    ));
                }
            }
            remaining -= 1;
        }
        Ok(collected)
    }

    /// Reads the longest run of characters accepted by a character set.
    ///
    /// A character is accepted when it is a member of `chars` and `negated` is
    /// false, or when it is not a member and `negated` is true. At most `width`
    /// characters are read when a width is given.
    ///
    /// # Errors
    /// Returns a parse error when no character is accepted.
    fn read_charset(
        &mut self,
        chars: &HashSet<char>,
        negated: bool,
        width: Option<usize>,
    ) -> BuiltinResult<String> {
        let mut matched = String::new();
        let mut taken = 0usize;
        while let Some(ch) = self.current_char() {
            let width_reached = matches!(width, Some(limit) if taken >= limit);
            let accepted = chars.contains(&ch) != negated;
            if width_reached || !accepted {
                break;
            }
            self.pos += ch.len_utf8();
            matched.push(ch);
            taken += 1;
        }
        if matched.is_empty() {
            Err(textscan_error_with(
                &TEXTSCAN_ERROR_PARSE,
                "textscan: character set conversion matched no characters",
            ))
        } else {
            Ok(matched)
        }
    }

    /// Reports whether the cursor sits on a delimiter or on a whitespace
    /// character (Unicode whitespace or a configured whitespace character).
    fn is_at_separator(&self) -> bool {
        if self.match_delimiter().is_some() {
            return true;
        }
        match self.current_char() {
            Some(ch) => ch.is_whitespace() || self.whitespace.contains(&ch),
            None => false,
        }
    }

    /// Returns the first entry of the delimiter table that the remaining text
    /// starts with, or `None` when no delimiter matches at the cursor.
    fn match_delimiter(&self) -> Option<&str> {
        let remaining = &self.text[self.pos..];
        for delimiter in &self.delimiters {
            if remaining.starts_with(delimiter.as_str()) {
                return Some(delimiter.as_str());
            }
        }
        None
    }

    /// Skips a comment that begins at the cursor and reports whether one was skipped.
    ///
    /// * Line comments run through the end of the line. The line ends at
    ///   `"\n"`, `"\r\n"`, or a lone `"\r"`, matching the line endings the
    ///   scanner registers as delimiters; the terminator is consumed.
    /// * Block comments run through the end marker, or to the end of the text
    ///   when the end marker never appears.
    ///
    /// Empty markers never match.
    fn skip_comment(&mut self) -> bool {
        match &self.options.comment_style {
            CommentStyle::None => false,
            CommentStyle::Line(markers) => {
                let at_marker = markers
                    .iter()
                    .any(|marker| !marker.is_empty() && self.text[self.pos..].starts_with(marker));
                if !at_marker {
                    return false;
                }
                while let Some(ch) = self.current_char() {
                    self.pos += ch.len_utf8();
                    match ch {
                        '\n' => break,
                        '\r' => {
                            // Consume the '\n' of a "\r\n" pair; a lone '\r'
                            // ends the comment so later records survive.
                            if self.current_char() == Some('\n') {
                                self.pos += '\n'.len_utf8();
                            }
                            break;
                        }
                        _ => {}
                    }
                }
                true
            }
            CommentStyle::Block { start, end } => {
                if start.is_empty() || !self.text[self.pos..].starts_with(start) {
                    return false;
                }
                let after_start = self.pos + start.len();
                self.pos = match self.text[after_start..].find(end) {
                    Some(end_idx) => after_start + end_idx + end.len(),
                    // Unterminated block comment: the rest of the text is comment.
                    None => self.text.len(),
                };
                true
            }
        }
    }

    /// Reports whether a comment starts at the cursor, without consuming it.
    ///
    /// Empty markers never match.
    fn is_at_comment(&self) -> bool {
        let remaining = &self.text[self.pos..];
        match &self.options.comment_style {
            CommentStyle::None => false,
            CommentStyle::Line(markers) => markers
                .iter()
                .filter(|marker| !marker.is_empty())
                .any(|marker| remaining.starts_with(marker.as_str())),
            CommentStyle::Block { start, .. } => {
                if start.is_empty() {
                    false
                } else {
                    remaining.starts_with(start.as_str())
                }
            }
        }
    }
}

/// A single value produced by one format conversion.
#[derive(Debug, Clone)]
enum ParsedValue {
    /// Result of a float, signed-integer or unsigned-integer conversion.
    Number(f64),
    /// Result of a string, quoted-string, character or character-set conversion.
    Text(String),
}

/// Converts a token to `f64`.
///
/// After trimming, the empty string and `nan` give NaN, and the spellings
/// `inf`/`infinity` (optionally signed) give the matching infinity; these
/// checks ignore ASCII case. Anything else goes through `str::parse::<f64>`.
///
/// # Errors
/// Returns a parse error when the token is not a valid floating-point value.
fn parse_float(token: &str) -> BuiltinResult<f64> {
    let trimmed = token.trim();
    let lowered = trimmed.to_ascii_lowercase();
    let special = match lowered.as_str() {
        "" | "nan" => Some(f64::NAN),
        "inf" | "+inf" | "infinity" | "+infinity" => Some(f64::INFINITY),
        "-inf" | "-infinity" => Some(f64::NEG_INFINITY),
        _ => None,
    };
    if let Some(value) = special {
        return Ok(value);
    }
    match trimmed.parse::<f64>() {
        Ok(value) => Ok(value),
        Err(_) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: cannot parse '{token}' as a floating-point value"),
        )),
    }
}

/// Converts a trimmed token to `i64`.
///
/// # Errors
/// Returns a parse error when the token is not a valid signed integer.
fn parse_signed_int(token: &str) -> BuiltinResult<i64> {
    match token.trim().parse::<i64>() {
        Ok(value) => Ok(value),
        Err(_) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: cannot parse '{token}' as an integer value"),
        )),
    }
}

/// Converts a trimmed token to `u64`.
///
/// # Errors
/// Returns a parse error when the token is not a valid unsigned integer.
fn parse_unsigned_int(token: &str) -> BuiltinResult<u64> {
    match token.trim().parse::<u64>() {
        Ok(value) => Ok(value),
        Err(_) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: cannot parse '{token}' as an unsigned integer value"),
        )),
    }
}

fn build_output(columns: Vec<ColumnData>, options: &TextscanOptions) -> BuiltinResult<Value> {
    let values = if options.collect_output {
        collect_output(columns)?
    } else {
        columns
            .into_iter()
            .map(column_to_value)
            .collect::<BuiltinResult<Vec<_>>>()?
    };
    let len = values.len();
    CellArray::new(values, 1, len)
        .map(Value::Cell)
        .map_err(|err| textscan_error_with(&TEXTSCAN_ERROR_PARSE, format!("textscan: {err}")))
}

/// Converts columns to output values, merging each maximal run of adjacent
/// numeric columns into a single matrix value.
///
/// Non-numeric columns are converted one by one and keep their position
/// relative to the merged groups.
///
/// # Errors
/// Propagates errors from the individual conversions.
fn collect_output(columns: Vec<ColumnData>) -> BuiltinResult<Vec<Value>> {
    let mut collected = Vec::new();
    let mut cursor = 0usize;
    while let Some(column) = columns.get(cursor) {
        if column.kind() != OutputKind::Numeric {
            collected.push(column_to_value(column.clone())?);
            cursor += 1;
            continue;
        }
        // Length of the numeric run that starts at `cursor` (at least one).
        let run_len = columns[cursor..]
            .iter()
            .take_while(|candidate| candidate.kind() == OutputKind::Numeric)
            .count();
        let run_end = cursor + run_len;
        collected.push(numeric_group_to_value(&columns[cursor..run_end])?);
        cursor = run_end;
    }
    Ok(collected)
}

/// Converts one column to its output value: an `N x 1` tensor for numeric
/// data, or the value built by `cell_string_column` for text data.
///
/// # Errors
/// Reports tensor construction failures as parse errors.
fn column_to_value(column: ColumnData) -> BuiltinResult<Value> {
    let values = match column {
        ColumnData::Text(entries) => return cell_string_column(&entries),
        ColumnData::Numeric(values) => values,
    };
    let rows = values.len();
    match Tensor::new(values, vec![rows, 1]) {
        Ok(tensor) => Ok(Value::Tensor(tensor)),
        Err(err) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: {err}"),
        )),
    }
}

/// Merges numeric columns of equal length into one `rows x cols` tensor.
///
/// The data is laid out one input column after another, and the row count is
/// taken from the first column (zero when `columns` is empty).
///
/// # Errors
/// Returns a parse error when the columns differ in length or the tensor
/// cannot be built.
///
/// # Panics
/// Panics if a text column is passed in.
fn numeric_group_to_value(columns: &[ColumnData]) -> BuiltinResult<Value> {
    let cols = columns.len();
    let rows = match columns.first() {
        Some(first) => first.len(),
        None => 0,
    };
    let mut data = Vec::with_capacity(rows * cols);
    for column in columns {
        match column {
            ColumnData::Numeric(values) if values.len() == rows => {
                data.extend_from_slice(values);
            }
            ColumnData::Numeric(_) => {
                return Err(textscan_error_with(
                    &TEXTSCAN_ERROR_PARSE,
                    "textscan: collected numeric columns have inconsistent lengths",
                ));
            }
            ColumnData::Text(_) => unreachable!("numeric group contains text column"),
        }
    }
    match Tensor::new(data, vec![rows, cols]) {
        Ok(tensor) => Ok(Value::Tensor(tensor)),
        Err(err) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: {err}"),
        )),
    }
}

/// Builds an `N x 1` cell array whose entries are the given strings.
///
/// # Errors
/// Reports cell construction failures as parse errors.
fn cell_string_column(values: &[String]) -> BuiltinResult<Value> {
    let entries: Vec<Value> = values
        .iter()
        .map(|text| Value::String(text.clone()))
        .collect();
    match CellArray::new(entries, values.len(), 1) {
        Ok(cell) => Ok(Value::Cell(cell)),
        Err(err) => Err(textscan_error_with(
            &TEXTSCAN_ERROR_PARSE,
            format!("textscan: {err}"),
        )),
    }
}

/// Extracts text from a string value, a single-row character array, or a
/// one-element string array.
///
/// `context` names the argument in the error message.
///
/// # Errors
/// Returns an argument error for any other value.
fn string_scalar(value: &Value, context: &str) -> BuiltinResult<String> {
    let text = match value {
        Value::String(s) => s.clone(),
        Value::CharArray(ca) if ca.rows == 1 => ca.data.iter().collect::<String>(),
        Value::StringArray(sa) if sa.data.len() == 1 => sa.data[0].clone(),
        _ => {
            return Err(textscan_error_with(
                &TEXTSCAN_ERROR_ARGUMENT,
                format!("textscan: expected {context} as a string scalar or character vector"),
            ));
        }
    };
    Ok(text)
}

/// Extracts a list of strings from a cell array (visited row by row), a
/// string array, or a single text value.
///
/// `context` names the argument in error messages.
///
/// # Errors
/// Returns an argument error when a cell element cannot be fetched or is not
/// text, or when a non-container value is not text.
fn string_list(value: &Value, context: &str) -> BuiltinResult<Vec<String>> {
    match value {
        Value::StringArray(sa) => Ok(sa.data.clone()),
        Value::Cell(cell) => {
            let mut texts = Vec::with_capacity(cell.data.len());
            for row in 0..cell.rows {
                for col in 0..cell.cols {
                    let element = cell.get(row, col).map_err(|err| {
                        textscan_error_with(&TEXTSCAN_ERROR_ARGUMENT, format!("textscan: {err}"))
                    })?;
                    texts.push(string_scalar(&element, context)?);
                }
            }
            Ok(texts)
        }
        other => string_scalar(other, context).map(|text| vec![text]),
    }
}

/// Parses the `Delimiter` option into a list of delimiter strings.
///
/// An entry that consists solely of one of the escape sequences `\b`, `\n`,
/// `\r`, `\t` or `\\` is replaced by the control character (or backslash) it
/// denotes, following the escape sequences MATLAB documents for this option.
/// Other entries are used verbatim.
///
/// # Errors
/// Returns an argument error when the value is not text or a list of text,
/// or when any entry is empty.
fn delimiter_list(value: &Value) -> BuiltinResult<Vec<String>> {
    string_list(value, "Delimiter")?
        .into_iter()
        .map(|raw| {
            let escape = match raw.as_str() {
                "\\b" => Some("\u{8}"),
                "\\n" => Some("\n"),
                "\\r" => Some("\r"),
                "\\t" => Some("\t"),
                "\\\\" => Some("\\"),
                _ => None,
            };
            let delimiter = match escape {
                Some(replacement) => replacement.to_string(),
                None => raw,
            };
            if delimiter.is_empty() {
                return Err(textscan_error_with(
                    &TEXTSCAN_ERROR_ARGUMENT,
                    "textscan: Delimiter entries must not be empty",
                ));
            }
            Ok(delimiter)
        })
        .collect()
}

/// Parses the `CommentStyle` option.
///
/// * Text equal to `none` (ASCII case-insensitive) disables comments; any
///   other text is a single line-comment marker.
/// * A cell array with exactly two elements is a block comment: the first
///   element opens it and the second closes it (read along the row for a
///   one-row cell, otherwise down the first column).
/// * Any other cell array is a list of line-comment markers.
///
/// # Errors
/// Returns an argument error when the value, or any cell element, is not text.
fn parse_comment_style(value: &Value) -> BuiltinResult<CommentStyle> {
    let Value::Cell(cell) = value else {
        let text = string_scalar(value, "CommentStyle")?;
        return Ok(if text.eq_ignore_ascii_case("none") {
            CommentStyle::None
        } else {
            CommentStyle::Line(vec![text])
        });
    };

    // Fetches one cell element and requires it to be text.
    let element_text = |row: usize, col: usize| -> BuiltinResult<String> {
        let element = cell.get(row, col).map_err(|err| {
            textscan_error_with(&TEXTSCAN_ERROR_ARGUMENT, format!("textscan: {err}"))
        })?;
        string_scalar(&element, "CommentStyle")
    };

    if cell.data.len() == 2 {
        let start = element_text(0, 0)?;
        let end = if cell.rows == 1 {
            element_text(0, 1)?
        } else {
            element_text(1, 0)?
        };
        return Ok(CommentStyle::Block { start, end });
    }

    let mut markers = Vec::new();
    for row in 0..cell.rows {
        for col in 0..cell.cols {
            markers.push(element_text(row, col)?);
        }
    }
    Ok(CommentStyle::Line(markers))
}

/// Interprets an option value as a boolean.
///
/// Accepts a logical value, the numbers 0 and 1, and the texts
/// `true`/`on`/`yes`/`1` and `false`/`off`/`no`/`0` (trimmed, ASCII
/// case-insensitive). `context` names the option in error messages.
///
/// # Errors
/// Returns an argument error when the value is neither a recognised number
/// nor text, or when the text is not one of the accepted spellings.
fn bool_like(value: &Value, context: &str) -> BuiltinResult<bool> {
    let direct = match value {
        Value::Bool(flag) => Some(*flag),
        Value::Num(number) if number.abs() < f64::EPSILON => Some(false),
        Value::Num(number) if (*number - 1.0).abs() < f64::EPSILON => Some(true),
        Value::Int(int) => match int.to_i64() {
            0 => Some(false),
            1 => Some(true),
            _ => None,
        },
        _ => None,
    };
    if let Some(flag) = direct {
        return Ok(flag);
    }
    // Everything else must be text; non-text values fail inside `string_scalar`.
    let text = string_scalar(value, context)?;
    match text.trim().to_ascii_lowercase().as_str() {
        "true" | "on" | "yes" | "1" => Ok(true),
        "false" | "off" | "no" | "0" => Ok(false),
        _ => Err(textscan_error_with(
            &TEXTSCAN_ERROR_ARGUMENT,
            format!("textscan: {context} must be logical"),
        )),
    }
}

/// Interprets an option value as a non-negative integer.
///
/// Accepts a number, an integer value, or a one-element tensor, provided the
/// value is finite, not negative, and has no fractional part. `context` names
/// the option in the error message.
///
/// # Errors
/// Returns an argument error for any other value.
fn nonnegative_usize(value: &Value, context: &str) -> BuiltinResult<usize> {
    let invalid = || {
        textscan_error_with(
            &TEXTSCAN_ERROR_ARGUMENT,
            format!("textscan: {context} must be a nonnegative integer scalar"),
        )
    };
    let raw = match value {
        Value::Num(number) => *number,
        Value::Int(int) => int.to_i64() as f64,
        Value::Tensor(tensor) if tensor.data.len() == 1 => tensor.data[0],
        _ => return Err(invalid()),
    };
    let acceptable = raw.is_finite() && raw >= 0.0 && raw.fract() == 0.0;
    if acceptable {
        Ok(raw as usize)
    } else {
        Err(invalid())
    }
}

/// Interprets a value as an `i32`, as used for file identifiers.
///
/// Accepts a number, an integer value, or a one-element tensor whose value is
/// finite, integral, and within the `i32` range; returns `None` otherwise.
fn numeric_fid(value: &Value) -> Option<i32> {
    let raw = match value {
        Value::Num(number) => *number,
        Value::Int(int) => int.to_i64() as f64,
        Value::Tensor(tensor) if tensor.data.len() == 1 => tensor.data[0],
        _ => return None,
    };
    let lowest = i32::MIN as f64;
    let highest = i32::MAX as f64;
    let integral = raw.is_finite() && raw.fract() == 0.0;
    let in_range = raw >= lowest && raw <= highest;
    if integral && in_range {
        Some(raw as i32)
    } else {
        None
    }
}

/// Reports whether `value` is accepted by `numeric_fid`.
///
/// Despite the general name, this is true only for scalars that are finite,
/// integral, and within the `i32` range; other numeric scalars (for example
/// `1.5`) yield `false`.
fn is_numeric_scalar(value: &Value) -> bool {
    numeric_fid(value).is_some()
}

#[cfg(test)]
mod tests {
    use super::*;
    use futures::executor::block_on;
    use runmat_filesystem::OpenOptions;
    use std::sync::{Arc, Mutex as StdMutex};

    use crate::builtins::io::filetext::registry::RegisteredFile;

    /// Borrows the cell array inside `value`, panicking for any other variant.
    fn output_cell(value: &Value) -> &CellArray {
        match value {
            Value::Cell(cell) => cell,
            _ => panic!("expected cell array output"),
        }
    }

    /// Returns the entry at row 0, column `col` of the output cell array.
    fn output_value(value: &Value, col: usize) -> Value {
        let cell = output_cell(value);
        cell.get(0, col).expect("output cell")
    }

    /// Returns the data of the tensor stored in output column `col`.
    fn numeric_column(value: &Value, col: usize) -> Vec<f64> {
        match output_value(value, col) {
            Value::Tensor(tensor) => tensor.data,
            _ => panic!("expected tensor"),
        }
    }

    /// Returns the strings of the text cell stored in output column `col`,
    /// reading column 0 of every row.
    fn text_column(value: &Value, col: usize) -> Vec<String> {
        let Value::Cell(cell) = output_value(value, col) else {
            panic!("expected text cell column");
        };
        (0..cell.rows)
            .map(|row| match cell.get(row, 0).expect("text cell") {
                Value::String(text) => text,
                _ => panic!("expected string"),
            })
            .collect()
    }

    /// The builtin descriptor must advertise both the two-argument form and
    /// the form that takes trailing arguments.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_descriptor_covers_core_forms() {
        let labels: Vec<&str> = TEXTSCAN_DESCRIPTOR
            .signatures
            .iter()
            .map(|sig| sig.label)
            .collect();
        assert!(labels.contains(&"C = textscan(textOrFileID, formatSpec)"));
        assert!(labels.contains(&"C = textscan(textOrFileID, formatSpec, args...)"));
    }

    /// A `%f %s` format applied to two lines yields one numeric and one text column.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_reads_mixed_columns_from_text() {
        let out = block_on(textscan_builtin(
            Value::from("1 alpha\n2 beta\n"),
            Value::from("%f %s"),
            Vec::new(),
        ))
        .expect("textscan");
        assert_eq!(numeric_column(&out, 0), vec![1.0, 2.0]);
        assert_eq!(
            text_column(&out, 1),
            vec!["alpha".to_string(), "beta".to_string()]
        );
    }

    /// Exercises `Delimiter`, `HeaderLines`, `CommentStyle` and `TreatAsEmpty`
    /// together: the header line and the `%` comment line are dropped and the
    /// `NA` field becomes NaN.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_honors_delimiter_header_comments_and_treat_empty() {
        let text = "# header\nA,1.5\nB,NA\n% ignored\nC,3.5\n";
        let out = block_on(textscan_builtin(
            Value::from(text),
            Value::from("%s %f"),
            vec![
                Value::from("Delimiter"),
                Value::from(","),
                Value::from("HeaderLines"),
                Value::Num(1.0),
                Value::from("CommentStyle"),
                Value::from("%"),
                Value::from("TreatAsEmpty"),
                Value::from("NA"),
            ],
        ))
        .expect("textscan");
        assert_eq!(
            text_column(&out, 0),
            vec!["A".to_string(), "B".to_string(), "C".to_string()]
        );
        let nums = numeric_column(&out, 1);
        assert_eq!(nums[0], 1.5);
        assert!(nums[1].is_nan());
        assert_eq!(nums[2], 3.5);
    }

    /// Combines a leading numeric argument of 1 (only the first record is
    /// expected in the output), a skipped `%*s` field, `CollectOutput`, and a
    /// `%q` field whose quoted text contains the delimiter.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_supports_repeat_skip_collect_and_quotes() {
        let out = block_on(textscan_builtin(
            Value::from("1,drop,2,\"hello, world\"\n3,drop,4,\"tail\"\n"),
            Value::from("%f %*s %f %q"),
            vec![
                Value::Num(1.0),
                Value::from("Delimiter"),
                Value::from(","),
                Value::from("CollectOutput"),
                Value::Bool(true),
            ],
        ))
        .expect("textscan");
        let Value::Tensor(group) = output_value(&out, 0) else {
            panic!("expected collected numeric group");
        };
        assert_eq!(group.shape, vec![1, 2]);
        assert_eq!(group.data, vec![1.0, 2.0]);
        assert_eq!(text_column(&out, 1), vec!["hello, world".to_string()]);
    }

    /// With `ReturnOnError` set to false, an unparsable field surfaces as an
    /// error instead of ending the scan.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_return_on_error_false_reports_parse_failure() {
        let err = block_on(textscan_builtin(
            Value::from("1\nbad\n"),
            Value::from("%f"),
            vec![Value::from("ReturnOnError"), Value::Bool(false)],
        ))
        .expect_err("parse failure");
        assert!(err.message().contains("cannot parse"));
    }

    /// A literal comma in the format separates the two numeric fields even
    /// though no `Delimiter` option is given.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_honors_literals_in_format_spec() {
        let out = block_on(textscan_builtin(
            Value::from("1,2\n3,4\n"),
            Value::from("%f,%f"),
            Vec::new(),
        ))
        .expect("textscan");
        assert_eq!(numeric_column(&out, 0), vec![1.0, 3.0]);
        assert_eq!(numeric_column(&out, 1), vec![2.0, 4.0]);
    }

    /// A width-limited `%2s` leaves the rest of the token for the next
    /// conversion, and each `%c` consumes exactly one character.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_width_and_char_conversions_leave_remainder() {
        let strings = block_on(textscan_builtin(
            Value::from("abcdef"),
            Value::from("%2s%s"),
            Vec::new(),
        ))
        .expect("textscan strings");
        assert_eq!(text_column(&strings, 0), vec!["ab".to_string()]);
        assert_eq!(text_column(&strings, 1), vec!["cdef".to_string()]);

        let chars = block_on(textscan_builtin(
            Value::from("abc"),
            Value::from("%c%c%c"),
            Vec::new(),
        ))
        .expect("textscan chars");
        assert_eq!(text_column(&chars, 0), vec!["a".to_string()]);
        assert_eq!(text_column(&chars, 1), vec!["b".to_string()]);
        assert_eq!(text_column(&chars, 2), vec!["c".to_string()]);
    }

    /// With `CollectOutput`, two numeric columns merge into one 2x2 tensor
    /// whose data lists the first column's values before the second's.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_collect_output_preserves_column_major_numeric_group() {
        let out = block_on(textscan_builtin(
            Value::from("1 2\n3 4\n"),
            Value::from("%f %f"),
            vec![Value::from("CollectOutput"), Value::Bool(true)],
        ))
        .expect("textscan");
        let Value::Tensor(group) = output_value(&out, 0) else {
            panic!("expected collected numeric group");
        };
        assert_eq!(group.shape, vec![2, 2]);
        assert_eq!(group.data, vec![1.0, 3.0, 2.0, 4.0]);
    }

    /// After reading a single record through a registered file identifier,
    /// the underlying file must be positioned at the start of the next record.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_repeated_file_read_preserves_next_record_position() {
        let _guard = registry::test_guard();
        registry::reset_for_tests();
        let mut path = std::env::temp_dir();
        path.push("runmat_textscan_file_position.txt");
        std::fs::write(&path, "10 ten\n20 twenty\n").expect("write fixture");

        let mut options = OpenOptions::new();
        options.read(true);
        let file = block_on(options.open_async(&path)).expect("open file");
        let handle = Arc::new(StdMutex::new(Some(file)));
        let fid = registry::register_file(RegisteredFile {
            path: path.clone(),
            permission: "r".to_string(),
            machinefmt: "native".to_string(),
            encoding: "UTF-8".to_string(),
            handle: handle.clone(),
        });

        let out = block_on(textscan_builtin(
            Value::Num(fid as f64),
            Value::from("%f %s"),
            vec![Value::Num(1.0)],
        ))
        .expect("textscan");
        assert_eq!(numeric_column(&out, 0), vec![10.0]);
        assert_eq!(text_column(&out, 1), vec!["ten".to_string()]);

        // Read whatever the file handle still has to offer; it must be the second record.
        let mut remaining = String::new();
        let mut guard = handle.lock().expect("lock");
        let file = guard.as_mut().expect("file");
        std::io::Read::read_to_string(file, &mut remaining).expect("read remaining");
        assert_eq!(remaining, "20 twenty\n");

        // NOTE(review): `guard` is still held here — confirm `registry::close`
        // never locks this same handle.
        let _ = registry::close(fid);
        let _ = std::fs::remove_file(path);
        registry::reset_for_tests();
    }

    /// Runs the Shift-JIS position fixture with the `shift_jis` encoding name;
    /// the two-byte sequence 0x82 0xA0 is expected to decode to "あ".
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_shift_jis_registered_file_restores_source_byte_position() {
        shift_jis_registered_file_restores_source_byte_position(
            "shift_jis",
            &[
                b'1', b' ', 0x82, 0xA0, b'\n', b'2', b' ', b'n', b'e', b'x', b't', b'\n',
            ],
            "あ",
        );
    }

    /// Runs the Shift-JIS position fixture with the `windows-31j` alias and
    /// checks that `ms932` and `x-sjis` are also recognised as Shift-JIS names.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_shift_jis_alias_restores_source_byte_position() {
        shift_jis_registered_file_restores_source_byte_position(
            "windows-31j",
            &[
                b'1', b' ', 0x82, 0xA0, b'\n', b'2', b' ', b'n', b'e', b'x', b't', b'\n',
            ],
            "あ",
        );
        assert!(is_shift_jis_encoding("ms932"));
        assert!(is_shift_jis_encoding("x-sjis"));
    }

    /// A lone 0x80 byte is expected to decode to U+0080 and to account for
    /// exactly one source byte when the file position is restored.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_shift_jis_0x80_advances_one_source_byte() {
        shift_jis_registered_file_restores_source_byte_position(
            "ms932",
            &[
                b'1', b' ', 0x80, b'\n', b'2', b' ', b'n', b'e', b'x', b't', b'\n',
            ],
            "\u{80}",
        );
    }

    /// Shared fixture for the Shift-JIS tests.
    ///
    /// Writes `bytes` to a temporary file, registers it under `encoding`,
    /// reads one `%f %s` record, and asserts that the number is 1, the text
    /// is `expected_text`, and the bytes left in the file are `"2 next\n"`.
    fn shift_jis_registered_file_restores_source_byte_position(
        encoding: &str,
        bytes: &[u8],
        expected_text: &str,
    ) {
        let _guard = registry::test_guard();
        registry::reset_for_tests();
        let mut path = std::env::temp_dir();
        // One fixture file per encoding name, with '-' made filename-friendly.
        path.push(format!(
            "runmat_textscan_shift_jis_position_{}.txt",
            encoding.replace('-', "_")
        ));
        std::fs::write(&path, bytes).expect("write fixture");

        let mut options = OpenOptions::new();
        options.read(true);
        let file = block_on(options.open_async(&path)).expect("open file");
        let handle = Arc::new(StdMutex::new(Some(file)));
        let fid = registry::register_file(RegisteredFile {
            path: path.clone(),
            permission: "r".to_string(),
            machinefmt: "native".to_string(),
            encoding: encoding.to_string(),
            handle: handle.clone(),
        });

        let out = block_on(textscan_builtin(
            Value::Num(fid as f64),
            Value::from("%f %s"),
            vec![Value::Num(1.0)],
        ))
        .expect("textscan");
        assert_eq!(numeric_column(&out, 0), vec![1.0]);
        assert_eq!(text_column(&out, 1), vec![expected_text.to_string()]);

        // The raw bytes still unread must be exactly the second record.
        let mut remaining = Vec::new();
        let mut guard = handle.lock().expect("lock");
        let file = guard.as_mut().expect("file");
        std::io::Read::read_to_end(file, &mut remaining).expect("read remaining");
        assert_eq!(remaining, b"2 next\n");

        // NOTE(review): `guard` is still held here — confirm `registry::close`
        // never locks this same handle.
        let _ = registry::close(fid);
        let _ = std::fs::remove_file(path);
        registry::reset_for_tests();
    }

    /// Passing identifier 0 must fail with a message that mentions
    /// "standard input/output".
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_rejects_standard_stream_identifiers() {
        let err = block_on(textscan_builtin(
            Value::Num(0.0),
            Value::from("%f"),
            Vec::new(),
        ))
        .expect_err("standard stream rejected");
        assert!(err.message().contains("standard input/output"));
    }

    /// Reads every record from a registered file identifier, skipping one
    /// header line via `HeaderLines`.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn textscan_reads_from_registered_file_identifier() {
        let _guard = registry::test_guard();
        registry::reset_for_tests();
        let mut path = std::env::temp_dir();
        path.push("runmat_textscan_registered_file.txt");
        std::fs::write(&path, "skip\n10 ten\n20 twenty\n").expect("write fixture");

        let mut options = OpenOptions::new();
        options.read(true);
        let file = block_on(options.open_async(&path)).expect("open file");
        let handle = Arc::new(StdMutex::new(Some(file)));
        let fid = registry::register_file(RegisteredFile {
            path: path.clone(),
            permission: "r".to_string(),
            machinefmt: "native".to_string(),
            encoding: "UTF-8".to_string(),
            handle,
        });

        let out = block_on(textscan_builtin(
            Value::Num(fid as f64),
            Value::from("%f %s"),
            vec![Value::from("HeaderLines"), Value::Num(1.0)],
        ))
        .expect("textscan");
        assert_eq!(numeric_column(&out, 0), vec![10.0, 20.0]);
        assert_eq!(
            text_column(&out, 1),
            vec!["ten".to_string(), "twenty".to_string()]
        );

        let _ = registry::close(fid);
        let _ = std::fs::remove_file(path);
        registry::reset_for_tests();
    }
}