use std;
#[cfg(feature = "regex")]
use regex::Regex;
/// Kind of value a single `{...}` format specifier asks the scanner to read.
#[derive(Debug, PartialEq)]
enum FmtType {
/// Default (`{}` and `{s}`): read until whitespace or the specifier's end character.
NonWhitespaceOrEnd,
/// `{[...]}`: read characters accepted by a list of inclusive character ranges.
Pattern,
/// `{d}`: optionally signed run of decimal digits.
Dec10,
/// `{x}`: hexadecimal digits, optionally prefixed with `0x`.
Hex16,
/// `{f}`: decimal number with optional fraction and `e` exponent.
Flt,
/// `{/.../}`: regular expression; only available with the `regex` feature.
#[cfg(feature = "regex")]
Regex,
}
use std::{error::Error, fmt};
/// Error value carrying a human-readable description of a failed scan.
#[derive(Debug, PartialEq)]
pub struct ScanError(pub String);

impl fmt::Display for ScanError {
    /// Renders the error as `Scan error: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("Scan error: ")?;
        f.write_str(&self.0)
    }
}

impl Error for ScanError {}
/// Cursor over a buffer of characters, with an optional width limit for bounded scans.
struct VecScanner {
    /// Characters being scanned.
    data: Vec<char>,
    /// Index of the current character; at or past `data.len()` once exhausted.
    pos: usize,
    /// Position at which `inc_limit` reports the limit as hit; `0` means no limit is active.
    limit_pos: usize,
}

impl VecScanner {
    /// Creates a scanner positioned on the first character of `d`, with no limit armed.
    fn new(d: Vec<char>) -> VecScanner {
        VecScanner {
            data: d,
            pos: 0,
            limit_pos: 0,
        }
    }

    /// Returns the character under the cursor.
    ///
    /// Panics (index out of bounds) when the scanner is at the end, so callers must
    /// check `is_end` or the result of `inc`/`inc_limit` first.
    fn cur(&self) -> char {
        self.data[self.pos]
    }

    /// Returns the character `n` positions ahead of the cursor, or `None` past the end.
    fn peek(&self, n: usize) -> Option<char> {
        self.data.get(self.pos + n).cloned()
    }

    /// Returns `true` once the cursor has moved past the last character.
    fn is_end(&self) -> bool {
        self.data.len() <= self.pos
    }

    /// Advances by one character; returns `true` if a character is still available.
    fn inc(&mut self) -> bool {
        self.pos += 1;
        self.pos < self.data.len()
    }

    /// Arms a limit `max_length` characters ahead of the cursor, or clears it for `None`.
    fn start_inc_limit(&mut self, max_length: Option<usize>) {
        let here = self.pos;
        self.limit_pos = max_length.map_or(0, |width| here + width);
    }

    /// Returns `true` when a limit is armed and the cursor has reached it.
    fn hit_inc_limit(&mut self) -> bool {
        self.limit_pos != 0 && self.limit_pos <= self.pos
    }

    /// Advances by one character; returns `true` only if neither the end of the data
    /// nor the armed limit has been reached.
    fn inc_limit(&mut self) -> bool {
        self.pos += 1;
        !self.is_end() && !self.hit_inc_limit()
    }
}
/// Returns `true` for the four characters the scanner treats as whitespace:
/// space, tab, line feed and carriage return.
fn is_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\n' || c == '\r'
}
/// Advances `vs` past any run of whitespace at the cursor.
///
/// Returns `true` if a non-whitespace character is now under the cursor, `false` if the
/// scanner ran out of data.
fn skip_whitespace(vs: &mut VecScanner) -> bool {
    loop {
        if vs.is_end() {
            return false;
        }
        if !is_whitespace(vs.cur()) {
            return true;
        }
        vs.inc();
    }
}
/// Parsed form of one `{...}` format specifier.
struct FmtResult {
/// What kind of token to read from the input.
data_type: FmtType,
/// Width parsed from a numeric prefix such as `{3d}`; `None` when absent or unparsable.
max_length: Option<usize>,
/// `false` when the specifier starts with `*`, meaning the token is read but not returned.
store_result: bool,
/// `true` for `[^...]` patterns: characters in `char_list` end the token instead of extending it.
invert_char_list: bool,
/// Extra terminator for default tokens: the format character following a bare `{}`, else `' '`.
end_char: char,
/// Inclusive `(low, high)` character ranges collected from a `[...]` pattern.
char_list: Vec<(char, char)>,
/// Compiled regular expression for `{/.../}` specifiers.
#[cfg(feature = "regex")]
regex: Option<Regex>,
}
/// Parses one format specifier, starting on the character right after the opening `{`.
///
/// Grammar handled here: an optional `*` (do not store the result), then either a bare `}`
/// or an optional decimal width followed by a type character (`s`, `d`, `x`, `f`, `[`, and
/// `/` with the `regex` feature) and its closing `}`.
///
/// Returns `None` if the specifier is malformed or the format string ends inside it. On
/// success the cursor is left just past the closing `}`.
fn get_format(fstr: &mut VecScanner) -> Option<FmtResult> {
    let mut res = FmtResult {
        data_type: FmtType::NonWhitespaceOrEnd,
        max_length: None,
        end_char: ' ',
        store_result: true,
        invert_char_list: false,
        char_list: vec![],
        #[cfg(feature = "regex")]
        regex: None,
    };

    // A leading '*' means "consume but do not return this token".
    if fstr.cur() == '*' {
        res.store_result = false;
        if !fstr.inc() {
            return None;
        }
    }

    // Bare `{}`: the token also ends at whatever format character follows the brace.
    if fstr.cur() == '}' {
        if fstr.inc() {
            res.end_char = fstr.cur();
        }
        return Some(res);
    }

    // Optional maximum width, written as decimal digits before the type character.
    let width_start = fstr.pos;
    while fstr.cur().is_digit(10) {
        if !fstr.inc() {
            return None;
        }
    }
    if fstr.pos > width_start {
        let digits: String = fstr.data[width_start..fstr.pos].iter().cloned().collect();
        res.max_length = digits.parse().ok();
    }

    res.data_type = match fstr.cur() {
        's' => FmtType::NonWhitespaceOrEnd,
        'd' => FmtType::Dec10,
        'x' => FmtType::Hex16,
        'f' => FmtType::Flt,
        '[' => FmtType::Pattern,
        #[cfg(feature = "regex")]
        '/' => FmtType::Regex,
        _ => return None,
    };
    if !fstr.inc() {
        return None;
    }

    match res.data_type {
        FmtType::Pattern => handle_pattern(res, fstr),
        #[cfg(feature = "regex")]
        FmtType::Regex => handle_regex(res, fstr),
        // Simple types must be closed immediately by '}'.
        _ if fstr.cur() == '}' => {
            fstr.inc();
            Some(res)
        }
        _ => None,
    }
}
/// Parses the body of a `[...]` character-class specifier, starting on the character
/// right after `[`, and fills `res.char_list` with inclusive ranges.
///
/// A leading `^` inverts the class. A `]` or `-` in first position is taken literally.
/// `a-z` adds a range unless the `-` is directly followed by `]`, in which case both
/// characters are literal. Returns `None` if the format string ends before the closing
/// `]}`; on success the cursor is left just past the `}`.
fn handle_pattern(mut res: FmtResult, fstr: &mut VecScanner) -> Option<FmtResult> {
    res.data_type = FmtType::Pattern;

    if fstr.cur() == '^' {
        res.invert_char_list = true;
        if !fstr.inc() {
            return None;
        }
    }

    // ']' or '-' as the very first class member cannot be a delimiter, so keep it literally.
    let first = fstr.cur();
    if first == ']' || first == '-' {
        res.char_list.push((first, first));
        if !fstr.inc() {
            return None;
        }
    }

    while fstr.cur() != ']' {
        let low = fstr.cur();
        let is_range = fstr.peek(1) == Some('-') && fstr.peek(2) != Some(']');
        let high = if is_range {
            // Step over the '-' onto the upper bound; running out of format here falls
            // through to the failing advance below the loop.
            if !(fstr.inc() && fstr.inc()) {
                break;
            }
            fstr.cur()
        } else {
            low
        };
        res.char_list.push((low, high));
        if !fstr.inc() {
            return None;
        }
    }

    // Step past ']' and require the closing '}'.
    if !fstr.inc() || fstr.cur() != '}' {
        return None;
    }
    fstr.inc();
    Some(res)
}
/// Parses the body of a `{/regex/}` specifier, starting on the first character after the
/// opening `/`, compiles it anchored with a leading `^`, and stores it in `res.regex`.
///
/// The regex runs up to the first `/` that is not escaped by a backslash. Returns `None`
/// if the format string ends before the closing `/}`, if the regex does not compile, or
/// if the closing `/` is not followed by `}`. On success the cursor is left just past `}`.
#[cfg(feature = "regex")]
fn handle_regex(mut res: FmtResult, fstr: &mut VecScanner) -> Option<FmtResult> {
    let start = fstr.pos;
    let mut escaped = false;
    // Inspect the current character before advancing, so a regex that starts with `\`
    // (or is empty) is handled, and stop with `None` instead of indexing past the end
    // when the closing '/' is missing.
    loop {
        let c = fstr.cur();
        if c == '/' && !escaped {
            break;
        }
        // A backslash escapes the next character only if it is not itself escaped.
        escaped = c == '\\' && !escaped;
        if !fstr.inc() {
            return None;
        }
    }

    // Anchor the expression so it can only match at the current input position.
    let substr = Some('^')
        .into_iter()
        .chain(fstr.data[start..fstr.pos].iter().cloned())
        .collect::<String>();
    match Regex::new(&substr) {
        Ok(re) => res.regex = Some(re),
        Err(_) => return None,
    }

    // Step past the closing '/'; the format may end here, so check before reading.
    if !fstr.inc() || fstr.cur() != '}' {
        return None;
    }
    fstr.inc();
    Some(res)
}
/// Consumes a decimal integer token (`{d}`): an optional sign followed by digits,
/// reading at most `max_length` characters when a width is given.
fn scan_dec10(vs: &mut VecScanner, max_length: Option<usize>) {
    // Arm (or clear) the width limit first; the digit scanner honours it via `inc_limit`.
    vs.start_inc_limit(max_length);
    scan_dec10_nest(vs)
}
/// Consumes an optional `+`/`-` followed by a run of decimal digits, using whatever width
/// limit is currently armed on `vs`.
///
/// Stops at the first non-digit, at the end of the data, or when the limit is hit. The
/// scanner must not be at the end on entry, because the current character is read first.
fn scan_dec10_nest(vs: &mut VecScanner) {
    let lead = vs.cur();
    if (lead == '+' || lead == '-') && !vs.inc_limit() {
        return;
    }
    loop {
        if !vs.cur().is_digit(10) {
            break;
        }
        if !vs.inc_limit() {
            break;
        }
    }
}
/// Consumes a hexadecimal token (`{x}`), reading at most `max_length` characters when a
/// width is given.
///
/// A leading `0` and a following `x` are each skipped if present (they are checked
/// independently, so a lone `x` is also skipped), then hex digits are consumed.
fn scan_hex16(vs: &mut VecScanner, max_length: Option<usize>) {
    vs.start_inc_limit(max_length);
    for &prefix in ['0', 'x'].iter() {
        if vs.cur() == prefix && !vs.inc_limit() {
            return;
        }
    }
    loop {
        if !vs.cur().is_digit(16) {
            return;
        }
        if !vs.inc_limit() {
            return;
        }
    }
}
/// Consumes a floating-point token (`{f}`): an optionally signed integer part, an optional
/// `.` with fraction digits, and an optional `e` exponent with its own optional sign.
/// At most `max_length` characters are read when a width is given.
///
/// The scanner must not be at the end on entry. Scanning stops without error at the end
/// of the data or at the width limit.
fn scan_float(vs: &mut VecScanner, max_length: Option<usize>) {
    vs.start_inc_limit(max_length);
    scan_dec10_nest(vs);
    // The integer scan returns nothing, so re-check why it stopped: if the input is
    // exhausted, reading the current character would index out of bounds, and if the
    // width limit was reached, continuing would consume characters beyond it.
    if vs.is_end() || vs.hit_inc_limit() {
        return;
    }
    if vs.cur() == '.' {
        if !vs.inc_limit() {
            return;
        }
        while vs.cur().is_digit(10) {
            if !vs.inc_limit() {
                return;
            }
        }
    }
    if vs.cur() == 'e' {
        if !vs.inc_limit() {
            return;
        }
        scan_dec10_nest(vs);
    }
}
/// Consumes a default token: everything up to, but not including, the next whitespace
/// character or the `end` character, or up to the end of the data.
///
/// The scanner must not be at the end on entry.
fn scan_nonws_or_end(vs: &mut VecScanner, end: char) {
    loop {
        let c = vs.cur();
        if is_whitespace(c) || c == end {
            return;
        }
        if !vs.inc() {
            return;
        }
    }
}
/// Consumes a `[...]` token: characters are taken while they fall inside one of the
/// inclusive ranges in `fmt.char_list`, or while they fall outside all of them when
/// `fmt.invert_char_list` is set.
///
/// The scanner must not be at the end on entry.
fn scan_pattern(vs: &mut VecScanner, fmt: &mut FmtResult) {
    loop {
        let c = vs.cur();
        let listed = fmt.char_list.iter().any(|&(low, high)| low <= c && c <= high);
        // Being listed accepts the character in a normal class and rejects it in an
        // inverted one, so the token ends exactly when the two flags agree.
        if listed == fmt.invert_char_list {
            return;
        }
        if !vs.inc() {
            return;
        }
    }
}
/// Outcome of running a `{/.../}` regex against the remaining input.
#[cfg(feature = "regex")]
enum ReMatch {
/// The regex matched and has a first capture group; `len` is derived from the end
/// offset of that group and is used as a length counted from the start of the token.
Captured { len: usize },
/// The regex did not match, or matched without a first capture group.
NoCapture,
}
/// Runs the specifier's compiled regex against the remaining input and advances `vs`
/// past the whole match.
///
/// Returns `ReMatch::Captured` with the end of the first capture group, measured in
/// characters from the start of the token, when that group took part in the match.
/// Otherwise returns `ReMatch::NoCapture`; if the regex did not match at all the cursor
/// is left untouched.
///
/// NOTE(review): the caller slices from the token start up to `len`, so a group that does
/// not begin at the start of the match also returns the text in front of it.
#[cfg(feature = "regex")]
fn scan_regex(vs: &mut VecScanner, fmt: &mut FmtResult) -> ReMatch {
    let re = fmt
        .regex
        .take()
        .expect("regex specifier must carry a compiled regex");
    let remainder = vs.data[vs.pos..].iter().cloned().collect::<String>();
    if let Some(mat) = re.captures(&remainder) {
        // The regex crate reports byte offsets into `remainder`, while the scanner
        // indexes characters; convert so multi-byte characters do not skew the cursor.
        let whole_end = mat
            .get(0)
            .expect("group 0 is always present in a match")
            .end();
        vs.pos += remainder[..whole_end].chars().count();
        if let Some(cap) = mat.get(1) {
            return ReMatch::Captured {
                len: remainder[..cap.end()].chars().count(),
            };
        }
    }
    ReMatch::NoCapture
}
/// Reads one token from `vs` as described by `fmt` and returns its text.
///
/// The matching scanner advances the cursor; the token is the span it consumed. For
/// decimal and float tokens a single leading `+` is dropped from the returned text. With
/// the `regex` feature, a regex that has a first capture group returns the text from the
/// token start up to the end of that group instead of the whole match.
///
/// The scanner must not be at the end on entry.
fn get_token(vs: &mut VecScanner, fmt: &mut FmtResult) -> String {
    let token_start = vs.pos;
    match fmt.data_type {
        FmtType::NonWhitespaceOrEnd => scan_nonws_or_end(vs, fmt.end_char),
        FmtType::Dec10 => scan_dec10(vs, fmt.max_length),
        FmtType::Hex16 => scan_hex16(vs, fmt.max_length),
        FmtType::Flt => scan_float(vs, fmt.max_length),
        FmtType::Pattern => scan_pattern(vs, fmt),
        #[cfg(feature = "regex")]
        FmtType::Regex => {
            if let ReMatch::Captured { len } = scan_regex(vs, fmt) {
                return vs.data[token_start..token_start + len]
                    .iter()
                    .cloned()
                    .collect();
            }
        }
    }

    // A leading '+' on a number carries no information, so it is left out of the result.
    let is_number = fmt.data_type == FmtType::Dec10 || fmt.data_type == FmtType::Flt;
    let text_start = if is_number && vs.data[token_start] == '+' {
        token_start + 1
    } else {
        token_start
    };
    vs.data[text_start..vs.pos].iter().cloned().collect()
}
/// Scans `input_string` according to `format` and returns the captured tokens in order.
///
/// Whitespace in either string is skipped before each step. Outside of specifiers, format
/// characters must match the input exactly; `{{` and `}}` stand for literal braces. A
/// `{...}` specifier reads one token from the input, which is returned unless the
/// specifier starts with `*`. Hexadecimal tokens are returned without their `0x` prefix.
///
/// Scanning stops silently at the first mismatch, malformed specifier, or when either
/// string is exhausted; whatever was captured up to that point is returned.
pub fn scan(input_string: &str, format: &str) -> std::vec::IntoIter<String> {
    let mut captured: Vec<String> = Vec::new();
    let mut fmtstr = VecScanner::new(format.chars().collect());
    let mut instr = VecScanner::new(input_string.chars().collect());

    while skip_whitespace(&mut fmtstr) && skip_whitespace(&mut instr) {
        // Whether this step ends with a literal character comparison.
        let mut literal = true;

        if fmtstr.cur() == '{' {
            if !fmtstr.inc() {
                break;
            }
            // A second '{' is an escaped brace and is compared literally below.
            if fmtstr.cur() != '{' {
                let mut spec = match get_format(&mut fmtstr) {
                    Some(spec) => spec,
                    None => break,
                };
                let token = get_token(&mut instr, &mut spec);
                if spec.store_result {
                    if spec.data_type == FmtType::Hex16 {
                        captured.push(token.trim_start_matches("0x").to_string());
                    } else {
                        captured.push(token);
                    }
                }
                literal = false;
            }
        } else if fmtstr.cur() == '}' && !fmtstr.inc() {
            // The first '}' of an escape was skipped and nothing follows it.
            break;
        }

        if literal && (fmtstr.cur() != instr.cur() || !fmtstr.inc() || !instr.inc()) {
            break;
        }
    }

    captured.into_iter()
}
// Literal format text must match the input, the input's leading whitespace is skipped,
// and each `{d}` yields one decimal token; nothing is left after the third value.
#[test]
fn test_simple() {
let mut res = scan(" data 42-12=30", "data {d}-{d}={d}");
assert_eq!(res.next().unwrap(), "42");
assert_eq!(res.next().unwrap(), "12");
assert_eq!(res.next().unwrap(), "30");
assert_eq!(res.next(), None);
}
// A leading '+' is accepted by `{d}` and `{f}` but is not part of the returned token.
#[test]
fn test_plus_sign() {
let mut res = scan("+42", "{d}");
assert_eq!(res.next().unwrap(), "42");
let mut res = scan("+42.7", "{f}");
assert_eq!(res.next().unwrap(), "42.7");
}
// Combines escaped braces (`{{`, `}}`), a bare `{}` that stops at the following format
// character ('}' here), and floats with sign, fraction and exponent. The trailing "fg"
// after the last float is not part of any token.
#[test]
fn test_complex() {
let mut res = scan(
"test{123 bye -456} hi -22.7e-1 +1.23fg",
"test{{{d} bye {}}} hi {f} {f}",
);
assert_eq!(res.next().unwrap(), "123");
assert_eq!(res.next().unwrap(), "-456");
assert_eq!(res.next().unwrap(), "-22.7e-1");
assert_eq!(res.next().unwrap(), "1.23");
assert_eq!(res.next(), None);
}
// A trailing "\r\n" counts as whitespace and so is not included in the last `{}` token.
#[test]
fn test_endline() {
let mut res = scan("hi 15.7\r\n", "{} {}");
assert_eq!(res.next().unwrap(), "hi");
assert_eq!(res.next().unwrap(), "15.7");
}
// `{x}` returns hex digits without the "0x" prefix and stops at the first non-hex
// character, so "fg" yields just "f".
#[test]
fn test_hex() {
let mut res = scan("hi 0x15 ff fg", "hi {x} {x} {x}");
assert_eq!(res.next().unwrap(), "15");
assert_eq!(res.next().unwrap(), "ff");
assert_eq!(res.next().unwrap(), "f");
}
// `{s}` reads up to whitespace, while a bare `{}` also stops at the format character
// that follows it ('n' here, giving "brow"). The final `{s}` takes all of "fox", so the
// trailing 'x' in the format is never compared.
#[test]
fn test_string() {
let mut res = scan("The quick brown fox", "{s}{s} {}n {s}x");
assert_eq!(res.next().unwrap(), "The");
assert_eq!(res.next().unwrap(), "quick");
assert_eq!(res.next().unwrap(), "brow");
assert_eq!(res.next().unwrap(), "fox");
}
// Character classes: plain ranges, inverted classes (`^`), single characters mixed with
// ranges, a `*` specifier whose token is consumed but not returned, and a non-ASCII
// class member.
#[test]
fn test_pattern() {
let mut res = scan(
"hi abcdefghijklmnop 0123456789",
"hi {[a-l]}{[^a-l ]} {[01234-8]}{[9]}",
);
assert_eq!(res.next().unwrap(), "abcdefghijkl");
assert_eq!(res.next().unwrap(), "mnop");
assert_eq!(res.next().unwrap(), "012345678");
assert_eq!(res.next().unwrap(), "9");
let mut res = scan("xyz 01234567λ89", "xyz {[40-3]}{*[65]}{[7-78-9λ]}");
assert_eq!(res.next().unwrap(), "01234");
assert_eq!(res.next().unwrap(), "7λ89");
}
// A numeric prefix caps how many characters a numeric specifier may consume, which lets
// adjacent fields be split without separators. A token may be shorter than its width.
#[test]
fn test_width() {
let mut res = scan("01123fe071 432", "{2d}{3d}{4x}{2d} {3d}");
assert_eq!(res.next().unwrap(), "01");
assert_eq!(res.next().unwrap(), "123");
assert_eq!(res.next().unwrap(), "fe07");
assert_eq!(res.next().unwrap(), "1");
assert_eq!(res.next().unwrap(), "432");
}
// `{/.../}` specifiers are only parsed when the `regex` feature is enabled; without it
// the scan stops at the specifier and every `unwrap()` below would panic. The module is
// therefore compiled only for test builds that have the feature on.
#[cfg(all(test, feature = "regex"))]
mod test_regex {
    use super::scan;

    // Without a capture group the whole regex match is returned.
    #[test]
    fn simple() {
        let mut res = scan("one (hello) two", "one ({/[^)]+/}) two");
        assert_eq!(res.next().unwrap(), "hello");
    }

    // An escaped `\)` inside the regex is part of the expression, not a delimiter.
    #[test]
    fn mixed_regex_and_pattern() {
        let mut res = scan("one ((hello)) two", r#"one ({/[^)]+\)?/}) two"#);
        assert_eq!(res.next().unwrap(), "(hello)");
    }

    // The extra ')' in the input breaks the literal match after the regex token; only
    // the first value is checked and any further value is just reported.
    #[test]
    fn bad_pattern() {
        let mut scanner = scan("one (hello)) two", "one ({/[^)]+/}) two");
        assert_eq!(scanner.next().unwrap(), "hello");
        if let Some(v) = scanner.next() {
            println!("got something unexpected on second iter: {:?}", v);
        }
    }

    // With a capture group, the group's text is returned while the whole match
    // (including the trailing space here) is consumed.
    #[test]
    fn uses_group_if_present() {
        let mut res = scan("one (((hello))) two", r#"one {/(\(.*\)) /}two"#);
        assert_eq!(res.next().unwrap(), "(((hello)))");
    }
}