use crate::cmap_section::{CMapParseError, CMapSection, CodeLen, SourceCode};
use crate::parser::ParserInput;
use crate::parser::cmap_parser::parse;
use log::error;
use rangemap::RangeInclusiveMap;
use std::collections::HashMap;
use thiserror::Error;
/// Lookup table built from a ToUnicode CMap: maps source character codes to
/// sequences of UTF-16 code units, and optionally back again.
#[derive(Debug, Default)]
pub struct ToUnicodeCMap {
/// One range map per source-code byte length; the map for codes that are
/// `code_len` bytes long is stored at index `code_len - 1`.
pub bf_ranges: [RangeInclusiveMap<SourceCode, BfRangeTarget>; 4],
/// Reverse index from a UTF-16 sequence to every source code that produces it.
/// It is only populated when the map is built from parsed CMap sections; a map
/// created empty and filled through `put`/`put_char` keeps `None` here.
reverse_map: Option<HashMap<Vec<u16>, Vec<ReverseCMapEntry>>>,
}
/// A source code (together with its byte length) that maps to a given UTF-16
/// sequence; the value type of the reverse lookup.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ReverseCMapEntry {
/// The source character code.
pub source_code: SourceCode,
/// Length of the source code in bytes (1 to 4).
pub code_len: CodeLen,
}
/// Errors reported while building a [`ToUnicodeCMap`].
#[derive(Debug, Error)]
pub enum UnicodeCMapError {
/// The CMap stream could not be parsed; carries the parser's error, which is
/// rendered with its pretty `Debug` representation.
#[error("could not parse ToUnicode CMap: {0:#?}")]
Parse(CMapParseError),
/// A range entry is unusable: its end code is below its start code, or it has
/// no destination at all.
#[error("invalid code range")]
InvalidCodeRange,
}
impl From<CMapParseError> for UnicodeCMapError {
fn from(err: CMapParseError) -> Self {
UnicodeCMapError::Parse(err)
}
}
impl ToUnicodeCMap {
const REPLACEMENT_CHAR: u16 = 0xfffd;
pub fn new() -> ToUnicodeCMap {
ToUnicodeCMap {
bf_ranges: [(); 4].map(|_| RangeInclusiveMap::new()),
reverse_map: None,
}
}
pub(crate) fn parse(stream_content: Vec<u8>) -> Result<ToUnicodeCMap, UnicodeCMapError> {
let cmap_sections = parse(ParserInput::new_extra(&stream_content[..], "cmap"))?;
Self::from_sections(cmap_sections)
}
fn from_sections(cmap_sections: Vec<CMapSection>) -> Result<ToUnicodeCMap, UnicodeCMapError> {
let mut cmap = Self::new();
for section in cmap_sections {
match section {
CMapSection::CsRange(_) => (), CMapSection::BfChar(char_mappings) => {
for ((code, code_len), dst) in char_mappings {
cmap.put_char(code, code_len, dst);
}
}
CMapSection::BfRange(range_mappings) => {
for ((start, end, code_len), dst_vec) in range_mappings {
if end < start {
return Err(UnicodeCMapError::InvalidCodeRange);
}
match dst_vec.len() {
1 if dst_vec[0].len() == 1 => cmap.put(
start,
end,
code_len,
BfRangeTarget::UTF16CodePoint {
offset: u32::wrapping_sub(dst_vec[0][0] as u32, start),
},
),
1 => cmap.put(
start,
end,
code_len,
BfRangeTarget::HexString(dst_vec[0].clone()),
),
0 => return Err(UnicodeCMapError::InvalidCodeRange),
_ => cmap.put(
start,
end,
code_len,
BfRangeTarget::ArrayOfHexStrings(dst_vec.clone()),
),
}
}
}
}
}
let mut rev_map = HashMap::new();
for code_len_idx in 0..cmap.bf_ranges.len() {
let code_len = (code_len_idx + 1) as u8;
for (range, target) in cmap.bf_ranges[code_len_idx].iter() {
for src_code in range.clone() {
let unicode_sequence: Option<Vec<u16>> = match target {
BfRangeTarget::UTF16CodePoint { offset } => {
Some(vec![u32::wrapping_add(src_code, *offset) as u16])
}
BfRangeTarget::HexString(hex_str_vec) => {
if src_code == *range.start() {
Some(hex_str_vec.clone())
} else if hex_str_vec.len() == 1 {
Some(vec![
hex_str_vec[0].wrapping_add((src_code - range.start()) as u16),
])
} else if !hex_str_vec.is_empty() {
let mut current_hex_str = hex_str_vec.clone();
if let Some(last_val) = current_hex_str.last_mut() {
*last_val =
last_val.wrapping_add((src_code - range.start()) as u16);
Some(current_hex_str)
} else {
None
}
} else {
None
}
}
BfRangeTarget::ArrayOfHexStrings(array_of_hex_str) => {
let index = (src_code - range.start()) as usize;
if index < array_of_hex_str.len() {
Some(array_of_hex_str[index].clone())
} else {
None
}
}
};
if let Some(uni_seq) = unicode_sequence {
if !uni_seq.is_empty() {
rev_map.entry(uni_seq).or_insert_with(Vec::new).push(
ReverseCMapEntry {
source_code: src_code,
code_len,
},
);
}
}
}
}
}
cmap.reverse_map = Some(rev_map);
Ok(cmap)
}
pub fn get(&self, code: SourceCode, code_len: CodeLen) -> Option<Vec<u16>> {
if code_len > 4 || code_len == 0 {
error!("Code lenght should be between l and 4 bytes, got {code_len}");
return None;
}
use BfRangeTarget::*;
let bf_ranges_map = &self.bf_ranges[(code_len - 1) as usize];
bf_ranges_map
.get_key_value(&code)
.map(|(range, value)| match value {
HexString(vec) => {
let mut ret_vec = vec.clone();
*(ret_vec.last_mut().unwrap()) += (code - range.start()) as u16;
ret_vec
}
UTF16CodePoint { offset } => vec![u32::wrapping_add(code, *offset) as u16],
ArrayOfHexStrings(vec_of_strings) => {
vec_of_strings[(code - range.start()) as usize].clone()
}
})
}
pub fn get_or_replacement_char(&self, code: SourceCode, code_len: CodeLen) -> Vec<u16> {
self.get(code, code_len)
.unwrap_or(vec![ToUnicodeCMap::REPLACEMENT_CHAR])
}
pub fn put(
&mut self,
src_code_lo: SourceCode,
src_code_hi: SourceCode,
code_len: CodeLen,
target: BfRangeTarget,
) {
if code_len > 4 || code_len == 0 {
error!("Code lenght should be between l and 4 bytes, got {code_len}, ignoring");
return;
}
self.bf_ranges[(code_len - 1) as usize].insert(src_code_lo..=src_code_hi, target)
}
pub fn put_char(&mut self, code: SourceCode, code_len: CodeLen, dst: Vec<u16>) {
let target = if dst.len() == 1 {
BfRangeTarget::UTF16CodePoint {
offset: u32::wrapping_sub(dst[0] as u32, code),
}
} else {
BfRangeTarget::HexString(dst)
};
self.put(code, code, code_len, target)
}
pub fn get_source_codes_for_unicode(
&self,
unicode_sequence: &[u16],
) -> Option<&[ReverseCMapEntry]> {
if let Some(map) = &self.reverse_map {
map.get(unicode_sequence).map(|v| v.as_slice())
} else {
None
}
}
}
/// Destination stored for a range of source codes; describes how the UTF-16
/// sequence of each code in the range is derived.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum BfRangeTarget {
/// UTF-16 sequence of the first code in the range; for later codes the last
/// unit is advanced by the code's distance from the range start.
HexString(Vec<u16>),
/// Single code unit obtained by adding `offset` (wrapping) to the source code
/// and truncating the sum to 16 bits.
UTF16CodePoint { offset: u32 },
/// One explicit UTF-16 sequence per code, indexed by the code's distance from
/// the range start.
ArrayOfHexStrings(Vec<Vec<u16>>),
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A mapping stored with `put_char` is returned by `get` for the same code
    /// and code length.
    #[test]
    fn put_char_can_be_retrieved() {
        let code = 0x01;
        let expected = vec![0x1234];
        let mut cmap = ToUnicodeCMap::new();
        cmap.put_char(code, 2, expected.clone());
        assert_eq!(cmap.get(code, 2), Some(expected));
    }

    /// A mapping is visible only under the code length it was stored with.
    #[test]
    fn char_can_be_retrieved_only_by_appropriate_len() {
        let code = 0x1;
        let stored_len = 4;
        let expected = vec![0x1234];
        let mut cmap = ToUnicodeCMap::new();
        cmap.put_char(code, stored_len, expected.clone());
        for other_len in 1..stored_len {
            assert_eq!(cmap.get(code, other_len), None);
        }
        assert_eq!(cmap.get(code, stored_len), Some(expected));
    }

    /// Out-of-range code lengths are ignored rather than causing a panic.
    #[test]
    fn wrong_code_len_does_not_panic() {
        let code = 0x1;
        let value = vec![0x1234];
        let mut cmap = ToUnicodeCMap::new();
        cmap.put_char(code, 5, value.clone());
        cmap.put_char(code, 0, value);
    }
}