use std::collections::HashSet;
/// Translates byte offsets between two versions of a text: the "a" side, in
/// which replaced ranges are measured, and the "b" side, in which each of
/// those ranges has been substituted by a replacement of a given length.
///
/// An empty map is the identity: `resolve(x) == x`.
#[derive(Debug, Default)]
pub(crate) struct ByteIndexMap {
    /// Anchor points `(b, a)`, one per recorded replacement, where `a` is the
    /// end of the replaced range on the "a" side and `b` is the offset just
    /// past its replacement on the "b" side. Kept non-decreasing in both
    /// components (checked by `consistency_check`).
    mapping: Vec<(u32, u32)>,
    /// Every "a"-side offset covered by a recorded range; used to reject
    /// overlapping replacements.
    inserted_ranges_a: HashSet<u32>,
}

impl ByteIndexMap {
    /// Creates an empty (identity) map.
    pub fn new() -> Self {
        Self::default()
    }

    /// Panics unless `mapping` is non-decreasing in both components.
    ///
    /// `s` is a label that is included in the panic message to tell the call
    /// sites apart.
    fn consistency_check(&self, s: &str) {
        let mut prev_b: u32 = 0;
        let mut prev_a: u32 = 0;
        for (ix, &(b, a)) in self.mapping.iter().enumerate() {
            if b < prev_b || a < prev_a {
                panic!(
                    "Inconsistent {}, ix {:?}, value {:?}, prev values {:?}",
                    s,
                    ix,
                    (b, a),
                    (prev_b, prev_a)
                );
            }
            prev_b = b;
            prev_a = a;
        }
    }

    /// Records that the "a"-side range `start..end` is replaced by `len_b`
    /// bytes on the "b" side.
    ///
    /// Ranges may be recorded in any order; anchors located after the new
    /// range have their "b" offset shifted by the length difference.
    ///
    /// # Panics
    ///
    /// Panics if `end < start`, if any offset in `start..end` was already
    /// covered by an earlier call ("Collision on ..."), if `end` is not
    /// strictly below the next anchor's "a" offset, or if the map would stop
    /// being monotonic.
    pub fn update(&mut self, start: u32, end: u32, len_b: u32) {
        assert!(end >= start);
        for i in start..end {
            // `insert` returns false when the offset is already present.
            assert!(self.inserted_ranges_a.insert(i), "Collision on {:?}", i);
        }
        self.consistency_check("Before update");

        // `mapping` is sorted by its "a" component, so the first anchor
        // strictly after `start` can be located by binary search.
        let insert_ix = self.mapping.partition_point(|&(_, pos_a)| pos_a <= start);
        if let Some(&(_, next_a)) = self.mapping.get(insert_ix) {
            assert!(end < next_a);
        }

        // Anchor immediately before the new range; the origin if none exists.
        let (pos_b, pos_a) = if insert_ix > 0 {
            self.mapping[insert_ix - 1]
        } else {
            (0, 0)
        };
        assert!(start >= pos_a);

        // Bytes between the previous anchor and `start` are unchanged, so
        // they advance both sides equally.
        let delta_same = start - pos_a;
        let new_a = end;
        let new_b = pos_b + (delta_same + len_b);
        assert!(new_a >= pos_a);
        assert!(new_b >= pos_b);
        self.mapping.insert(insert_ix, (new_b, new_a));

        // Every later anchor moves on the "b" side by `len_b - len_a`.
        let len_a = end - start;
        let mut prev_b = new_b;
        for entry in self.mapping.iter_mut().skip(insert_ix + 1) {
            let updated_b = entry.0 - len_a + len_b;
            entry.0 = updated_b;
            assert!(updated_b >= prev_b);
            prev_b = updated_b;
        }
        self.consistency_check("After update");
    }

    /// Translates the "b"-side offset `input_b` to the "a" side.
    ///
    /// The result is measured from the last anchor whose "b" offset is
    /// `<= input_b` (or from the origin when there is none), so an offset that
    /// falls inside a replacement is extrapolated from the preceding anchor.
    pub fn resolve(&self, input_b: u32) -> u32 {
        // `mapping` is sorted by its "b" component as well.
        let ix = self.mapping.partition_point(|&(pos_b, _)| pos_b <= input_b);
        let (pos_b, pos_a) = if ix > 0 { self.mapping[ix - 1] } else { (0, 0) };
        pos_a + (input_b - pos_b)
    }
}
/// Replaces LaTeX fragments in `src` with fixed placeholder words and returns
/// the rewritten text together with a [`ByteIndexMap`] that translates byte
/// offsets of the rewritten text back to offsets in `src`.
///
/// Four passes run in order, each on the output of the previous one:
/// `$$...$$`, `$...$` (single line), `\(...\)` (single line) and `\[...\]`
/// (may span lines).
///
/// NOTE(review): the `\(...\)` and `\[...\]` patterns are greedy, so several
/// such groups on one line (or, for `\[`, anywhere in the text) are matched as
/// one span that includes the text between them — confirm this is intended.
///
/// # Panics
///
/// Panics if a pattern fails to compile, or if `ByteIndexMap::update` rejects
/// a range — e.g. when a later pass matches a span that contains a placeholder
/// written by an earlier pass.
pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) {
    use regex::Regex;

    // (pattern, placeholder) pairs; the order of the passes is significant.
    let passes: [(&str, &str); 4] = [
        (r"\$\$[^\$]*\$\$", "LATEX_DOUBLE_DOLLAR_SUBSTITUTED"),
        (r"\$[^\$\n\r]*\$", "LATEX_SINGLE_DOLLAR_SUBSTITUTED"),
        (r"\\\([^\n\r]*\\\)", "LATEX_ESCAPED_PARENTHESIS_SUBSTITUTED"),
        (
            r"\\\[(.|\r\n|\r|\n)*\\\]",
            "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED",
        ),
    ];

    let mut index_map = ByteIndexMap::new();
    let mut text = src.to_string();

    for &(pattern, placeholder) in passes.iter() {
        let re = Regex::new(pattern).unwrap();
        let placeholder_len = placeholder.len() as u32;

        // Resolve every match against the map as it was before this pass;
        // the updates are applied only once all matches are collected.
        let pending: Vec<(u32, u32, u32)> = re
            .find_iter(&text)
            .map(|found| {
                let start = found.start() as u32;
                let end = found.end() as u32;
                let original_start = index_map.resolve(start);
                (original_start, original_start + end - start, placeholder_len)
            })
            .collect();

        for (range_start, range_end, len_b) in pending {
            index_map.update(range_start, range_end, len_b);
        }

        text = re.replace_all(&text, placeholder).to_string();
    }

    (text, index_map)
}