// regress 0.2.0
//
// A regular expression engine targeting ECMAScript syntax.
// Documentation
use std::env;
use std::io;
use std::io::BufRead;
use std::process;

/// A code point, held as its raw numeric value.
type CodePoint = u32;

/// A single case-folding entry: `orig` folds to `folded`.
#[derive(Debug, Copy, Clone)]
struct FoldPair {
    /// The code point being folded.
    orig: CodePoint,
    /// The code point that `orig` folds to.
    folded: CodePoint,
}

impl FoldPair {
    /// Returns the signed amount that must be added to `orig` to reach `folded`.
    fn delta(self) -> i32 {
        let FoldPair { orig, folded } = self;
        folded as i32 - orig as i32
    }

    /// Returns how far `rhs.orig` lies past this pair's `orig`.
    ///
    /// This is a plain unsigned subtraction, so `rhs.orig` must not be smaller than `self.orig`.
    fn stride_to(self, rhs: FoldPair) -> u32 {
        let (from, to) = (self.orig, rhs.orig);
        to - from
    }
}

/// A run of fold pairs that share one delta and are evenly spaced, so the whole run can be
/// emitted as a single `fr(start, length, delta, modulo)` entry in the generated table.
struct DeltaBlock {
    /// Folds original -> folded.
    folds: Vec<FoldPair>,
}

impl DeltaBlock {
    /// Largest value `length()` may reach: the generated `FoldRange::length` field is a `u8`.
    const MAX_LENGTH: u32 = u8::MAX as u32;

    /// Starts a new block that contains only `fp`.
    fn create(fp: FoldPair) -> DeltaBlock {
        DeltaBlock { folds: vec![fp] }
    }

    /// Returns the spacing between the first two pairs, or `None` while the block holds
    /// fewer than two pairs.
    fn stride(&self) -> Option<u32> {
        self.folds
            .get(1)
            .map(|&second| self.first().stride_to(second))
    }

    /// Returns the first pair of the block. Panics if the block has no folds.
    fn first(&self) -> FoldPair {
        *self.folds.first().expect("DeltaBlock has no folds")
    }

    /// Returns the last pair of the block. Panics if the block has no folds.
    fn last(&self) -> FoldPair {
        *self.folds.last().expect("DeltaBlock has no folds")
    }

    /// Returns the number of code points spanned from the first `orig` to the last `orig`,
    /// inclusive.
    fn length(&self) -> usize {
        (self.last().orig as usize) - (self.first().orig as usize) + 1
    }

    /// Returns the delta shared by every pair in the block.
    fn delta(&self) -> i32 {
        self.first().delta()
    }

    /// Reports whether `fp` may be added to this block.
    ///
    /// A pair is accepted when the resulting `length()` still fits in a `u8`, its delta
    /// equals the block's delta, and (once the block has a stride) it continues that stride.
    fn can_append(&self, fp: FoldPair) -> bool {
        let first = match self.folds.first() {
            Some(&first) => first,
            // New block.
            None => return true,
        };
        // Appending would make the length `span + 1`; that must not exceed MAX_LENGTH,
        // otherwise the emitted `u8` literal would be out of range.
        let span = fp.orig - first.orig;
        if span >= Self::MAX_LENGTH {
            return false;
        }
        // Different deltas cannot share a block.
        if first.delta() != fp.delta() {
            return false;
        }
        match self.stride() {
            // Strides must match.
            Some(stride) => stride == self.last().stride_to(fp),
            // No stride yet; this pair will define it.
            None => true,
        }
    }

    /// Adds `fp` to the block. The caller must have checked `can_append(fp)`.
    fn append(&mut self, fp: FoldPair) {
        debug_assert!(self.can_append(fp));
        self.folds.push(fp);
    }
}

fn create_delta_blocks(fps: &[FoldPair]) -> Vec<DeltaBlock> {
    let mut blocks: Vec<DeltaBlock> = Vec::new();
    for &fp in fps {
        match blocks.last_mut() {
            Some(ref mut db) if db.can_append(fp) => db.append(fp),
            _ => blocks.push(DeltaBlock::create(fp)),
        }
    }
    blocks
}

/// Renders `dbs` as the Rust source of the fold table.
///
/// The output is a fixed header (the `FoldRange` struct and the `fr` constructor) followed
/// by a `FOLDS` static with one `fr(...)` entry per block, four entries per line.
fn format_delta_blocks(dbs: &[DeltaBlock]) -> String {
    /// Renders one block as a call to the generated `fr` constructor.
    fn render(block: &DeltaBlock) -> String {
        format!(
            "fr({start:#04X}, {length}, {delta}, {modulo})",
            start = block.first().orig,
            length = block.length(),
            delta = block.delta(),
            // A block with a single pair has no stride; it is emitted with modulo 1.
            modulo = block.stride().unwrap_or(1),
        )
    }

    // One output row per group of up to four blocks, each row ending in a comma.
    let rows: Vec<String> = dbs
        .chunks(4)
        .map(|group| {
            let entries: Vec<String> = group.iter().map(render).collect();
            let mut row = entries.join(", ");
            row.push(',');
            row
        })
        .collect();

    let header = r#"
/// This file is autogenerated from gen_folds.rs and CaseFolding.txt.

pub struct FoldRange {
    /// The first codepoint in the range.
    pub start:u32,

    /// The length of the range, in code points.
    pub length:u8,

    /// The (signed) delta amount.
    /// Folds are performed by adding this (signed) value to a code point.
    pub delta:i32,

    /// The modulo amount.
    /// Folds are only performed if the code point is a multiple of this value.
    pub modulo:u8,
}

const fn fr(start: u32, length: u8, delta: i32, modulo: u8) -> FoldRange {
    FoldRange {start, length, delta, modulo}
}
"#;

    let body = rows.join("\n    ");
    format!(
        r#"
{prefix}

pub static FOLDS: [FoldRange; {count}] = [
    {lines}
];
"#,
        prefix = header.trim(),
        count = dbs.len(),
        lines = body
    )
}

/// Parse a CaseFolding line if its status is `C` (common) or `S` (simple).
///
/// Everything from the first `#` onwards is ignored as a comment. Returns `None` for
/// lines that do not have exactly four `;`-separated fields, lines with any other
/// status, and lines whose code points are not valid hexadecimal.
///
/// Example line: "0051; C; 0071; # LATIN CAPITAL LETTER Q"
fn process_simple_fold(s: &str) -> Option<FoldPair> {
    // `split` always yields at least one piece: the text before any `#` comment.
    let data = s.split('#').next()?;
    // Surrounding whitespace is removed per field, so the line itself needs no trim.
    let fields: Vec<&str> = data.split(';').map(str::trim).collect();
    let (orig, status, folded) = match fields.as_slice() {
        [orig, status, folded, _] => (*orig, *status, *folded),
        _ => return None,
    };
    if status != "C" && status != "S" {
        return None;
    }
    // Malformed code points make the line unusable rather than aborting the program.
    let from_hex = |field: &str| u32::from_str_radix(field, 16).ok();
    Some(FoldPair {
        orig: from_hex(orig)?,
        folded: from_hex(folded)?,
    })
}

/// Prints the usage line and terminates the process with exit status 1.
fn print_usage_and_exit() {
    // Written to stderr: in the documented invocation stdout is redirected into the
    // generated source file, so the usage text must not end up there.
    eprintln!("Usage: curl -L ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt | gen_folds > foldtable.rs");
    process::exit(1)
}

fn main() {
    let argv: Vec<String> = env::args().collect();
    if argv.len() >= 2 {
        print_usage_and_exit();
    }
    let stdin = io::stdin();
    let mut foldpairs = Vec::new();
    for line in stdin.lock().lines() {
        let s: String = line.unwrap();
        if let Some(s) = s.as_str().trim().split('#').next() {
            if let Some(fp) = process_simple_fold(s) {
                foldpairs.push(fp);
            }
        }
    }
    let dblocks = create_delta_blocks(&foldpairs);
    println!("{}", format_delta_blocks(&dblocks).trim());
}