use crate::hwp::model::{HwpControl, HwpParagraph};
use crate::hwp::record::{read_utf16le_str, Record};
/// Lowest UTF-16 code unit that `fixup_ruby_base_text` treats as a control character.
const CTRL_CHAR_LOW: u16 = 0x0001;
/// Highest UTF-16 code unit (inclusive) that `fixup_ruby_base_text` treats as a control character.
const CTRL_CHAR_HIGH: u16 = 0x001F;
/// Control code unit whose preceding text run is recorded as a ruby base-text candidate.
const CTRL_MARKER: u16 = 0x0003;
/// Payload bytes trailing a control code unit; the scanner jumps over them so
/// they are never decoded as text.
const CTRL_PARAM_BYTES: usize = 14;
/// Fills in `base_text` for the `HwpControl::Ruby` entries of `para` from the
/// paragraph's raw UTF-16LE text.
///
/// The raw bytes are scanned one code unit at a time. Every `CTRL_MARKER`
/// records the byte range of the plain-text run immediately before it, and the
/// N-th recorded range is assigned to the N-th Ruby control. A Ruby control
/// keeps its current `base_text` when its range is empty or when the markers
/// run out first. Nothing happens when `raw_para_text` is `None` or empty.
///
/// Any control code unit ends the current run. Control code units are not all
/// the same width: the HWP 5.0 control-character table classes 10 (line
/// break), 13 (paragraph break) and 24..=31 (hyphen, reserved, non-breaking
/// and fixed-width space) as "char" controls that occupy a single code unit,
/// while the remaining codes carry `CTRL_PARAM_BYTES` of payload. Skipping a
/// payload after a char control would swallow real text, and possibly a later
/// marker, so those codes advance by one code unit only.
pub(crate) fn fixup_ruby_base_text(para: &mut HwpParagraph) {
    // Size in bytes of one UTF-16 code unit.
    const UNIT_BYTES: usize = 2;

    let raw = match para.raw_para_text.as_deref() {
        Some(bytes) if !bytes.is_empty() => bytes,
        _ => return,
    };

    // Byte ranges (start, end) of the text run preceding each marker, in
    // document order.
    let mut base_ranges: Vec<(usize, usize)> = Vec::new();
    let mut pos = 0usize;
    let mut run_start = 0usize;
    while pos + 1 < raw.len() {
        let unit = u16::from_le_bytes([raw[pos], raw[pos + 1]]);
        let is_marker = unit == CTRL_MARKER;
        if !is_marker && !(CTRL_CHAR_LOW..=CTRL_CHAR_HIGH).contains(&unit) {
            // Ordinary text: extend the current run.
            pos += UNIT_BYTES;
            continue;
        }
        if is_marker {
            base_ranges.push((run_start, pos));
        }
        // Char-type controls have no payload; everything else (the marker
        // included) is followed by CTRL_PARAM_BYTES that must be skipped.
        let single_unit = !is_marker && matches!(unit, 0x000A | 0x000D | 0x0018..=0x001F);
        pos += if single_unit {
            UNIT_BYTES
        } else {
            UNIT_BYTES + CTRL_PARAM_BYTES
        };
        // Every control, whatever its width, starts a fresh run after it.
        run_start = pos;
    }

    let mut ranges = base_ranges.into_iter();
    for ctrl in para.controls.iter_mut() {
        if let HwpControl::Ruby { base_text, .. } = ctrl {
            // One range is consumed per Ruby control, even when it is empty,
            // so later controls stay aligned with their markers.
            if let Some((start, end)) = ranges.next() {
                if start < end {
                    if let Some(run) = raw.get(start..end) {
                        *base_text = decode_utf16le_text_run(run);
                    }
                }
            }
        }
    }
}
/// Decodes a run of UTF-16LE bytes into a `String`.
///
/// Bytes are consumed two at a time as little-endian code units; a trailing
/// odd byte is ignored. Unpaired surrogates become U+FFFD, matching
/// `String::from_utf16_lossy`.
fn decode_utf16le_text_run(bytes: &[u8]) -> String {
    let units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
    // Decoding straight from the iterator avoids both the intermediate
    // `Vec<u16>` and the extra clone of the finished string.
    char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect()
}
/// Extracts the annotation (ruby) text from a ruby control record.
///
/// Returns `None` when the record payload is shorter than six bytes.
/// Otherwise returns the string that `read_utf16le_str` yields when started at
/// byte offset 4 of the payload; the second element of its result is dropped.
pub(crate) fn parse_ruby_ctrl(rec: &Record) -> Option<String> {
    // Offset at which the string is read, and the shortest payload accepted.
    const TEXT_OFFSET: usize = 4;
    const MIN_RECORD_LEN: usize = 6;

    (rec.data.len() >= MIN_RECORD_LEN).then(|| read_utf16le_str(&rec.data, TEXT_OFFSET).0)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hwp::record::{CTRL_RUBY, HWPTAG_CTRL_HEADER};

    /// Wraps `data` in a level-0 `HWPTAG_CTRL_HEADER` record.
    fn ctrl_header_record(data: Vec<u8>) -> Record {
        Record {
            tag_id: HWPTAG_CTRL_HEADER,
            level: 0,
            data,
        }
    }

    /// Builds a ruby control record: the `CTRL_RUBY` id bytes, a `u16`
    /// code-unit count, then the annotation encoded as UTF-16LE.
    fn make_ruby_record(ruby_text: &str) -> Record {
        let chars: Vec<u16> = ruby_text.encode_utf16().collect();
        let mut data = CTRL_RUBY.to_le_bytes().to_vec();
        data.extend_from_slice(&(chars.len() as u16).to_le_bytes());
        data.extend(encode_u16s(&chars));
        ctrl_header_record(data)
    }

    /// Serialises code units as little-endian bytes.
    fn encode_u16s(units: &[u16]) -> Vec<u8> {
        units.iter().flat_map(|u| u.to_le_bytes()).collect()
    }

    /// Encodes `text` as UTF-16LE bytes.
    fn encode_text(text: &str) -> Vec<u8> {
        let units: Vec<u16> = text.encode_utf16().collect();
        encode_u16s(&units)
    }

    /// Appends control code unit `code` followed by a zeroed payload of
    /// `CTRL_PARAM_BYTES` bytes.
    fn push_ctrl(raw: &mut Vec<u8>, code: u16) {
        raw.extend_from_slice(&code.to_le_bytes());
        raw.extend_from_slice(&[0u8; CTRL_PARAM_BYTES]);
    }

    /// Raw paragraph text consisting of `base_chars` followed by one marker.
    fn make_raw_with_base(base_chars: &str) -> Vec<u8> {
        let mut raw = encode_text(base_chars);
        push_ctrl(&mut raw, CTRL_MARKER);
        raw
    }

    /// A Ruby control with the given annotation and an empty base text.
    fn ruby_control(ruby_text: &str) -> HwpControl {
        HwpControl::Ruby {
            base_text: String::new(),
            ruby_text: ruby_text.to_string(),
        }
    }

    /// A paragraph carrying `controls` and the optional raw text, with every
    /// other field left empty.
    fn make_para(raw: Option<Vec<u8>>, controls: Vec<HwpControl>) -> HwpParagraph {
        HwpParagraph {
            text: String::new(),
            char_shape_ids: Vec::new(),
            para_shape_id: 0,
            controls,
            raw_para_text: raw,
        }
    }

    /// A paragraph with raw text and a single Ruby control.
    fn make_para_with_ruby(raw: Vec<u8>, ruby_text: &str) -> HwpParagraph {
        make_para(Some(raw), vec![ruby_control(ruby_text)])
    }

    /// Base text of the first control; panics unless it is a Ruby control.
    fn first_ruby_base_text(owner: &HwpParagraph) -> &str {
        match owner.controls.first() {
            Some(HwpControl::Ruby { base_text, .. }) => base_text.as_str(),
            _ => panic!("expected Ruby control"),
        }
    }

    #[test]
    fn parse_ruby_ctrl_korean_annotation() {
        let rec = make_ruby_record("한자");
        let result = parse_ruby_ctrl(&rec).expect("should return Some");
        assert_eq!(result, "한자");
    }

    #[test]
    fn parse_ruby_ctrl_ascii_annotation() {
        let rec = make_ruby_record("kanji");
        let result = parse_ruby_ctrl(&rec).expect("should return Some");
        assert_eq!(result, "kanji");
    }

    #[test]
    fn parse_ruby_ctrl_empty_annotation() {
        let rec = make_ruby_record("");
        let result = parse_ruby_ctrl(&rec).expect("should return Some for empty annotation");
        assert_eq!(result, "");
    }

    #[test]
    fn parse_ruby_ctrl_too_short_returns_none() {
        // Only the control id, with no length field after it.
        let rec = ctrl_header_record(CTRL_RUBY.to_le_bytes().to_vec());
        assert!(parse_ruby_ctrl(&rec).is_none());
    }

    #[test]
    fn parse_ruby_ctrl_exactly_six_bytes_empty_annotation() {
        let mut data = CTRL_RUBY.to_le_bytes().to_vec();
        data.extend_from_slice(&0u16.to_le_bytes());
        let rec = ctrl_header_record(data);
        let result = parse_ruby_ctrl(&rec).expect("6-byte record must return Some");
        assert!(result.is_empty());
    }

    #[test]
    fn fixup_ruby_base_text_sets_base_text_from_raw() {
        let mut ruby_para = make_para_with_ruby(make_raw_with_base("漢字"), "한자");
        fixup_ruby_base_text(&mut ruby_para);
        match ruby_para.controls.first() {
            Some(HwpControl::Ruby {
                base_text,
                ruby_text,
            }) => {
                assert_eq!(base_text, "漢字");
                assert_eq!(ruby_text, "한자");
            }
            _ => panic!("expected Ruby control"),
        }
    }

    #[test]
    fn fixup_ruby_base_text_empty_base_at_paragraph_start() {
        let mut raw = Vec::new();
        push_ctrl(&mut raw, CTRL_MARKER);
        let mut ruby_para = make_para_with_ruby(raw, "ルビ");
        fixup_ruby_base_text(&mut ruby_para);
        let base_text = first_ruby_base_text(&ruby_para);
        assert!(
            base_text.is_empty(),
            "base_text at paragraph start must be empty, got {base_text:?}"
        );
    }

    #[test]
    fn fixup_ruby_base_text_multiple_ruby_controls() {
        let mut raw = encode_text("AB");
        push_ctrl(&mut raw, CTRL_MARKER);
        raw.extend(encode_text("CD"));
        push_ctrl(&mut raw, CTRL_MARKER);
        let mut ruby_para = make_para(
            Some(raw),
            vec![ruby_control("ann1"), ruby_control("ann2")],
        );
        fixup_ruby_base_text(&mut ruby_para);
        let bases: Vec<&str> = ruby_para
            .controls
            .iter()
            .filter_map(|c| {
                if let HwpControl::Ruby { base_text, .. } = c {
                    Some(base_text.as_str())
                } else {
                    None
                }
            })
            .collect();
        assert_eq!(bases, vec!["AB", "CD"]);
    }

    #[test]
    fn fixup_ruby_base_text_no_raw_does_nothing() {
        let mut ruby_para = make_para(None, vec![ruby_control("ann")]);
        fixup_ruby_base_text(&mut ruby_para);
        assert!(
            first_ruby_base_text(&ruby_para).is_empty(),
            "should remain empty when no raw data"
        );
    }

    #[test]
    fn fixup_ruby_base_text_other_ctrl_char_breaks_run() {
        // "Hi" is separated from the marker by another control, so the run
        // that reaches the marker is empty.
        let mut raw = encode_text("Hi");
        push_ctrl(&mut raw, 0x0001);
        push_ctrl(&mut raw, CTRL_MARKER);
        let mut ruby_para = make_para_with_ruby(raw, "ann");
        fixup_ruby_base_text(&mut ruby_para);
        let base_text = first_ruby_base_text(&ruby_para);
        assert!(
            base_text.is_empty(),
            "run must be broken by other ctrl char; got {base_text:?}"
        );
    }
}