// inkferro-core 0.1.0

//! Port of [`input-parser.ts`](../../../ink/src/input-parser.ts): the stateful,
//! pure segmentation layer.
//!
//! It splits a raw byte stream into [`Segment`]s — opaque text/key byte runs and
//! decoded paste payloads — handling escape-sequence boundary detection,
//! bracketed paste, backspace splitting, and partial-sequence buffering across
//! chunks. It never decodes a key into a [`Key`](super::keypress::Key); that is
//! the next layer up.
//!
//! # Byte vs UTF-16 note
//!
//! Upstream operates on a UTF-16 JS string and uses `codePointAt` + string
//! indices. This port operates on `&[u8]` with byte indices. Two spots depend
//! on codepoint length:
//!
//! - `parseEscapedCodePoint` advances past the codepoint following `ESC`. JS
//!   uses UTF-16 units (`> 0xffff ? 2 : 1`); this port advances by the **UTF-8
//!   byte length** of that codepoint (1–4). Both denote "the one codepoint
//!   after ESC", so the segment boundaries agree (the `😀` test pins
//!   this: 😀 is 4 UTF-8 bytes).
//! - Backspace splitting keys off `0x7F`/`0x08`, both ASCII single bytes that
//!   cannot occur inside a multi-byte UTF-8 sequence, so byte-index slicing is
//!   boundary-safe.
//!
//! All structural bytes the segmenter keys off (ESC `0x1B`, `[`, `O`, CSI
//! parameter/intermediate/final bytes, paste markers, backspace) are ASCII.

/// The escape byte (`ESC`, `0x1B`) that the segmenter scans for.
const ESC: u8 = 0x1b;
/// Marker that opens a bracketed paste: `ESC [ 2 0 0 ~`.
const PASTE_START: &[u8] = b"\x1b[200~";
/// Marker that closes a bracketed paste: `ESC [ 2 0 1 ~`.
const PASTE_END: &[u8] = b"\x1b[201~";

/// One unit of segmented input: either an opaque byte run (plain text or a
/// single key/escape sequence) or the payload of a bracketed paste.
///
/// Mirrors `InputEvent = string | {paste: string}` from `input-parser.ts`,
/// where the string arm is the raw sequence (later fed to `parse_keypress`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Segment {
    /// Raw bytes of a plain-text run, a lone backspace byte, or a single
    /// key/escape sequence. The bytes are passed through undecoded.
    Bytes(Vec<u8>),
    /// The bytes found between the paste-start and paste-end markers,
    /// delivered verbatim. The markers themselves are not included.
    Paste(Vec<u8>),
}

/// True for CSI parameter bytes, i.e. the range `0x30..=0x3F`
/// (digits plus `: ; < = > ?`).
fn is_csi_parameter_byte(byte: u8) -> bool {
    matches!(byte, 0x30..=0x3f)
}

/// True for CSI intermediate bytes, i.e. the range `0x20..=0x2F`
/// (space through `/`).
fn is_csi_intermediate_byte(byte: u8) -> bool {
    matches!(byte, 0x20..=0x2f)
}

/// True for CSI final bytes, i.e. the range `0x40..=0x7E` (`@` through `~`).
fn is_csi_final_byte(byte: u8) -> bool {
    matches!(byte, 0x40..=0x7e)
}

/// Result of attempting to parse one escape/control sequence.
enum Parsed {
    /// A complete sequence was recognized. `next_index` is the index one past
    /// its last byte, so the sequence occupies `[start_index, next_index)`
    /// where `start_index` is the position the caller started parsing from.
    Sequence { next_index: usize },
    /// The input ended before the sequence could be decided; more bytes are
    /// needed.
    Pending,
    /// The bytes at this position do not form a recognized control sequence.
    None,
}

/// `parseCsiSequence`: walk the body of a CSI sequence until its final byte.
///
/// `start_index` is where the sequence begins and `prefix_length` is the number
/// of bytes before the `[` introducer, so the body starts at
/// `start_index + prefix_length + 1`.
///
/// Returns `Sequence` with the index one past the final byte, `Pending` when
/// the input ends before a final byte is seen, and `None` when a byte that is
/// neither parameter, intermediate, nor final is encountered.
fn parse_csi_sequence(input: &[u8], start_index: usize, prefix_length: usize) -> Parsed {
    let body_start = start_index + prefix_length + 1;

    for (position, &current) in input.iter().enumerate().skip(body_start) {
        let is_body_byte = is_csi_parameter_byte(current) || is_csi_intermediate_byte(current);
        // Preserve legacy terminal function-key sequences like ESC[[A and
        // ESC[[5~: a second `[` directly after the introducer is skipped. This
        // must be tested before the final-byte check, because `[` (0x5B) also
        // lies in the final-byte range.
        let is_legacy_bracket = current == b'[' && position == body_start;
        if is_body_byte || is_legacy_bracket {
            continue;
        }

        return if is_csi_final_byte(current) {
            Parsed::Sequence {
                next_index: position + 1,
            }
        } else {
            Parsed::None
        };
    }

    // Ran out of bytes without seeing a final byte.
    Parsed::Pending
}

/// `parseSs3Sequence`: an SS3 sequence is `ESC O <final>`, i.e. exactly one
/// byte after the `O`.
///
/// The final byte sits at `start_index + prefix_length + 1`. Returns `Pending`
/// if that byte has not arrived yet, `None` if it is outside the final-byte
/// range, and otherwise `Sequence` with the index one past it.
fn parse_ss3_sequence(input: &[u8], start_index: usize, prefix_length: usize) -> Parsed {
    let final_index = start_index + prefix_length + 1;

    match input.get(final_index) {
        None => Parsed::Pending,
        Some(&candidate) if is_csi_final_byte(candidate) => Parsed::Sequence {
            next_index: final_index + 1,
        },
        Some(_) => Parsed::None,
    }
}

/// `parseControlSequence`: dispatch on the byte that follows the escape prefix.
///
/// `[` selects the CSI parser and `O` the SS3 parser. A missing byte yields
/// `Pending`, and any other byte yields `None`.
fn parse_control_sequence(input: &[u8], start_index: usize, prefix_length: usize) -> Parsed {
    match input.get(start_index + prefix_length) {
        None => Parsed::Pending,
        Some(b'[') => parse_csi_sequence(input, start_index, prefix_length),
        Some(b'O') => parse_ss3_sequence(input, start_index, prefix_length),
        Some(_) => Parsed::None,
    }
}

/// `parseEscapedCodePoint`: ESC followed by a single (non-control) codepoint.
/// The segment ends after the UTF-8 bytes of that codepoint.
///
/// Yields `Pending` when `utf8_codepoint_len` cannot size the codepoint yet —
/// that is, the byte after ESC is a multibyte lead whose continuation bytes
/// have not all arrived. The partial codepoint is then buffered for the next
/// chunk rather than emitted as `ESC + lone byte`. This streaming case has no
/// upstream oracle (the TS parser only ever sees whole UTF-16 strings);
/// buffering is the strictly-correct choice for a byte stream.
fn parse_escaped_code_point(input: &[u8], escape_index: usize) -> ParsedEscape {
    let code_point_start = escape_index + 1;

    if let Some(width) = utf8_codepoint_len(input, code_point_start) {
        ParsedEscape::Sequence {
            next_index: code_point_start + width,
        }
    } else {
        ParsedEscape::Pending
    }
}

/// Number of bytes occupied by the UTF-8 codepoint starting at `index`.
///
/// - `None` when `index` is out of range, or when the lead byte announces a
///   multibyte codepoint whose continuation bytes have not all arrived yet
///   (partial codepoint — buffer and wait).
/// - `Some(1)` for an ASCII byte, for a byte that cannot start a codepoint (a
///   stray continuation byte or `0xF8..=0xFF`), and for a multibyte lead that
///   is followed by a non-continuation byte. The latter two are treated as a
///   lone, lossy single-byte scalar.
/// - `Some(n)` (2–4) when the lead byte and all `n - 1` continuation bytes are
///   present. Only the bit patterns are checked; overlong or out-of-range
///   encodings are not rejected.
fn utf8_codepoint_len(input: &[u8], index: usize) -> Option<usize> {
    let (&lead, rest) = input.get(index..)?.split_first()?;

    let width = match lead {
        0x00..=0x7f => 1,
        0xc0..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xf7 => 4,
        // Stray continuation byte or invalid lead: lone scalar of length 1.
        _ => return Some(1),
    };

    // Inspect whichever continuation bytes are already available. A malformed
    // one anywhere in that run wins over "not enough bytes yet", because it is
    // encountered first when scanning left to right.
    let needed = width - 1;
    let available = &rest[..rest.len().min(needed)];
    if available.iter().any(|&byte| byte & 0xc0 != 0x80) {
        return Some(1);
    }

    // Well-formed so far: complete if every continuation byte is here,
    // otherwise still pending.
    (available.len() == needed).then_some(width)
}

/// Result of `parseEscapeSequence`: complete (with `next_index`) or pending.
///
/// Unlike [`Parsed`] there is no "not a sequence" outcome: anything that is not
/// a recognized control sequence is treated as `ESC` plus one codepoint.
enum ParsedEscape {
    /// The escape sequence is complete; `next_index` is one past its last byte.
    Sequence { next_index: usize },
    /// More bytes are needed before the sequence boundary can be decided.
    Pending,
}

/// `parseEscapeSequence`: starting at an ESC byte, determine the full escape
/// sequence boundary.
///
/// `escape_index` is the position of the ESC byte inside `input`. The outcome
/// is decided as follows:
///
/// - Nothing follows the ESC: `Pending`.
/// - `ESC ESC` followed by a control sequence (`[`… or `O`…): that whole
///   sequence, including both ESC bytes. If only `ESC ESC` has arrived so far,
///   or the control sequence is incomplete: `Pending`. If the byte after the
///   second ESC does not start a control sequence: the two ESC bytes alone.
/// - `ESC` followed by a control sequence: that sequence (or `Pending` while
///   incomplete).
/// - Otherwise: `ESC` plus the one codepoint that follows it.
fn parse_escape_sequence(input: &[u8], escape_index: usize) -> ParsedEscape {
    // Checked lookup instead of comparing against `input.len() - 1`: that
    // subtraction underflows on an empty slice. With `get`, a lone trailing ESC
    // and an empty/short input both simply report `Pending`.
    let Some(&next) = input.get(escape_index + 1) else {
        return ParsedEscape::Pending;
    };

    if next != ESC {
        return match parse_control_sequence(input, escape_index, 1) {
            Parsed::Pending => ParsedEscape::Pending,
            Parsed::Sequence { next_index } => ParsedEscape::Sequence { next_index },
            Parsed::None => parse_escaped_code_point(input, escape_index),
        };
    }

    // Double escape. `parse_control_sequence` reports `Pending` by itself when
    // no byte follows the second ESC, so no separate length check is needed.
    match parse_control_sequence(input, escape_index, 2) {
        Parsed::Pending => ParsedEscape::Pending,
        Parsed::Sequence { next_index } => ParsedEscape::Sequence { next_index },
        // Double escape, non-control: emit the two ESC bytes as one segment.
        Parsed::None => ParsedEscape::Sequence {
            next_index: escape_index + 2,
        },
    }
}

/// `splitBackspaceBytes`: append a non-escape text run to `events`, emitting
/// every backspace byte (`0x7F` or `0x08`) as its own one-byte segment and each
/// non-empty stretch between them as one segment. Other control characters
/// (`\r`, `\t`) are NOT split because they can appear inside pasted text.
///
/// Existing entries in `events` are left untouched; an empty `text` appends
/// nothing.
fn split_backspace_bytes(text: &[u8], events: &mut Vec<Segment>) {
    let is_backspace = |byte: u8| matches!(byte, 0x7f | 0x08);

    // Each piece is a (possibly empty) plain run, optionally terminated by a
    // single backspace byte.
    for piece in text.split_inclusive(|&byte| is_backspace(byte)) {
        let (plain, terminator) = match piece.split_last() {
            Some((&last, head)) if is_backspace(last) => (head, Some(last)),
            _ => (piece, None),
        };

        if !plain.is_empty() {
            events.push(Segment::Bytes(plain.to_vec()));
        }
        if let Some(backspace) = terminator {
            events.push(Segment::Bytes(vec![backspace]));
        }
    }
}

/// Locate the first occurrence of `needle` at or after position `from` in
/// `haystack`, returning its absolute index in `haystack`.
///
/// Returns `None` when `needle` is empty, when `from` lies past the end of
/// `haystack`, or when there is no match.
fn find_from(haystack: &[u8], needle: &[u8], from: usize) -> Option<usize> {
    // `windows(0)` panics, so an empty needle is rejected up front.
    if needle.is_empty() {
        return None;
    }

    haystack
        .get(from..)?
        .windows(needle.len())
        .zip(from..)
        .find_map(|(window, absolute)| (window == needle).then_some(absolute))
}

/// `parseKeypresses`: the core pass over one buffer.
///
/// Walks `input` from left to right, alternating between plain-text runs (split
/// on backspace bytes) and escape sequences. Returns the segments decided so
/// far together with the trailing bytes that could not be decided yet; the
/// latter is empty when everything was consumed.
///
/// Parsing stops early, handing back everything from the ESC onward as pending,
/// when an escape sequence is incomplete or when a paste-start marker has no
/// matching paste-end marker yet.
fn parse_keypresses(input: &[u8]) -> (Vec<Segment>, Vec<u8>) {
    let mut segments: Vec<Segment> = Vec::new();
    let mut cursor = 0;

    while cursor < input.len() {
        let Some(escape_index) = find_from(input, &[ESC], cursor) else {
            // No more escapes: the remainder is plain text.
            split_backspace_bytes(&input[cursor..], &mut segments);
            break;
        };

        // Plain text sitting in front of the escape.
        if cursor < escape_index {
            split_backspace_bytes(&input[cursor..escape_index], &mut segments);
        }

        let next_index = match parse_escape_sequence(input, escape_index) {
            ParsedEscape::Sequence { next_index } => next_index,
            ParsedEscape::Pending => return (segments, input[escape_index..].to_vec()),
        };
        let sequence = &input[escape_index..next_index];

        if sequence != PASTE_START {
            // Ordinary key/escape sequence: pass it through as-is.
            segments.push(Segment::Bytes(sequence.to_vec()));
            cursor = next_index;
            continue;
        }

        // Bracketed paste: everything up to the end marker is the payload. If
        // the end marker has not arrived, keep the start marker and the partial
        // payload buffered.
        let Some(paste_end) = find_from(input, PASTE_END, next_index) else {
            return (segments, input[escape_index..].to_vec());
        };
        segments.push(Segment::Paste(input[next_index..paste_end].to_vec()));
        cursor = paste_end + PASTE_END.len();
    }

    (segments, Vec::new())
}

/// Stateful segmenter mirroring `createInputParser`. Buffers partial sequences
/// across `push` calls.
#[derive(Debug, Default, Clone)]
pub struct Segmenter {
    // Bytes the previous `push` could not decide yet. Either empty or starting
    // at an ESC byte: an incomplete escape sequence, or a paste-start marker
    // plus the payload received so far while the paste-end marker is missing.
    pending: Vec<u8>,
}

impl Segmenter {
    /// `push(chunk)`: append `chunk` to whatever was buffered, re-run the
    /// segmentation pass over the combined bytes and return the segments that
    /// are now decided. Any undecided tail is kept for the next call.
    pub fn push(&mut self, chunk: &[u8]) -> Vec<Segment> {
        self.pending.extend_from_slice(chunk);
        let (events, leftover) = parse_keypresses(&self.pending);
        self.pending = leftover;
        events
    }

    /// `hasPendingEscape()`: whether a bare/partial escape is buffered (used by
    /// the host's escape-flush timer).
    ///
    /// True when the buffer starts with ESC, unless the buffer is exactly
    /// `ESC [ 2 0 0` (paste-start marker missing only its `~`) or begins with
    /// the complete paste-start marker (waiting for paste end).
    pub fn has_pending_escape(&self) -> bool {
        let buffered = self.pending.as_slice();

        let starts_with_escape = buffered.starts_with(&[ESC]);
        let inside_paste = buffered.starts_with(PASTE_START);
        let marker_missing_tilde = buffered == b"\x1b[200";

        starts_with_escape && !inside_paste && !marker_missing_tilde
    }

    /// `flushPendingEscape()`: hand over the whole pending buffer as literal
    /// input and leave it empty, or return `None` (buffer untouched) if it does
    /// not start with ESC.
    pub fn flush_pending_escape(&mut self) -> Option<Vec<u8>> {
        self.pending
            .starts_with(&[ESC])
            .then(|| std::mem::take(&mut self.pending))
    }

    /// `reset()`: discard any buffered bytes.
    pub fn reset(&mut self) {
        self.pending.clear();
    }
}

#[cfg(test)]
mod tests;