/// Coarse bucket describing how many characters a document carries per page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextDensity {
    /// Assigned by [`TextDensity::classify`] for at most 500 characters per page.
    Low,
    /// Assigned by [`TextDensity::classify`] for 501 to 2000 characters per page,
    /// and whenever the page count is zero.
    Medium,
    /// Assigned by [`TextDensity::classify`] for more than 2000 characters per page.
    High,
}

impl TextDensity {
    /// Buckets a document by its average number of characters per page.
    ///
    /// The average is `char_count / page_count` using integer division, so it
    /// is rounded down before being compared with the thresholds. When
    /// `page_count` is zero no average exists and [`TextDensity::Medium`] is
    /// returned.
    pub fn classify(char_count: usize, page_count: usize) -> Self {
        match char_count.checked_div(page_count) {
            // `checked_div` yields `None` only for a zero divisor (no pages).
            None => Self::Medium,
            Some(per_page) if per_page <= 500 => Self::Low,
            Some(per_page) if per_page <= 2000 => Self::Medium,
            Some(_) => Self::High,
        }
    }

    /// Factor used to scale a punctuation score for this density:
    /// `0.6` for `Low`, `1.0` for `Medium`, and `1.4` for `High`.
    pub fn score_multiplier(&self) -> f32 {
        match *self {
            Self::Low => 0.6,
            Self::Medium => 1.0,
            Self::High => 1.4,
        }
    }
}
pub fn is_fullwidth_punctuation(code: u32) -> bool {
is_sentence_ending_punctuation(code)
|| is_enumeration_punctuation(code)
|| is_bracket_punctuation(code)
|| is_other_cjk_punctuation(code)
}
/// Returns `true` when `code` is a sentence-ending mark: U+3002 (ideographic
/// full stop), U+FF01 (fullwidth exclamation mark), or U+FF1F (fullwidth
/// question mark).
pub fn is_sentence_ending_punctuation(code: u32) -> bool {
    const SENTENCE_ENDERS: [u32; 3] = [
        0x3002, // ideographic full stop
        0xFF01, // fullwidth exclamation mark
        0xFF1F, // fullwidth question mark
    ];
    SENTENCE_ENDERS.contains(&code)
}
/// Returns `true` when `code` is a clause- or list-separating mark: U+3001
/// (ideographic comma), U+FF0C (fullwidth comma), U+FF1B (fullwidth
/// semicolon), or U+FF1A (fullwidth colon).
pub fn is_enumeration_punctuation(code: u32) -> bool {
    const SEPARATORS: [u32; 4] = [
        0x3001, // ideographic comma
        0xFF0C, // fullwidth comma
        0xFF1B, // fullwidth semicolon
        0xFF1A, // fullwidth colon
    ];
    SEPARATORS.contains(&code)
}
/// Returns `true` when `code` is a CJK or fullwidth bracket, opening or closing.
///
/// Recognized code points:
/// - U+3008..=U+3011: angle, double angle, corner, white corner, and black
///   lenticular brackets
/// - U+3014..=U+3015: tortoise shell brackets
/// - U+FF08, U+FF09: fullwidth parentheses
/// - U+FF3B, U+FF3D: fullwidth square brackets
/// - U+FF5B, U+FF5D: fullwidth curly brackets
///
/// U+FF3C (fullwidth reverse solidus) and U+FF5C (fullwidth vertical line)
/// lie between the fullwidth square and curly bracket pairs but are not
/// brackets, so they are deliberately excluded. This keeps the accepted set
/// identical to the union of `is_opening_bracket` and `is_closing_bracket`.
pub fn is_bracket_punctuation(code: u32) -> bool {
    matches!(
        code,
        0x3008..=0x3011
            | 0x3014..=0x3015
            | 0xFF08..=0xFF09
            // Listed individually: the code point between each pair is not a bracket.
            | 0xFF3B
            | 0xFF3D
            | 0xFF5B
            | 0xFF5D
    )
}
/// Returns `true` when `code` is one of the remaining CJK marks that are
/// neither sentence-ending, enumeration, nor bracket punctuation: U+3000
/// (ideographic space), U+3003 (ditto mark), U+30FB (katakana middle dot),
/// U+FF0E (fullwidth full stop), U+FF0D (fullwidth hyphen-minus), or U+FF5E
/// (fullwidth tilde).
pub fn is_other_cjk_punctuation(code: u32) -> bool {
    const MISC_MARKS: [u32; 6] = [
        0x3000, // ideographic space
        0x3003, // ditto mark
        0x30FB, // katakana middle dot
        0xFF0E, // fullwidth full stop
        0xFF0D, // fullwidth hyphen-minus
        0xFF5E, // fullwidth tilde
    ];
    MISC_MARKS.contains(&code)
}
/// Scores `code` as a text boundary, optionally adjusted for text density.
///
/// The base score comes from `get_base_punctuation_score`. When `density` is
/// `Some`, that score is multiplied by the density's `score_multiplier`; when
/// it is `None`, the base score is returned unchanged. The result is not
/// clamped, so a multiplier above `1.0` can push the score above `1.0`.
pub fn get_cjk_punctuation_boundary_score(code: u32, density: Option<TextDensity>) -> f32 {
    let unscaled = get_base_punctuation_score(code);
    density.map_or(unscaled, |level| unscaled * level.score_multiplier())
}
/// Maps `code` to its unadjusted boundary score by punctuation category.
///
/// Categories are tested in priority order and the first match wins:
/// sentence-ending `1.0`, enumeration `0.9`, bracket `0.8`, other CJK
/// punctuation `0.7`. Anything else scores `0.0`.
fn get_base_punctuation_score(code: u32) -> f32 {
    match code {
        c if is_sentence_ending_punctuation(c) => 1.0,
        c if is_enumeration_punctuation(c) => 0.9,
        c if is_bracket_punctuation(c) => 0.8,
        c if is_other_cjk_punctuation(c) => 0.7,
        _ => 0.0,
    }
}
/// Returns `true` when `code` is an opening (left-hand) CJK or fullwidth
/// bracket.
pub fn is_opening_bracket(code: u32) -> bool {
    const OPENERS: [u32; 9] = [
        0x3008, // left angle bracket
        0x300A, // left double angle bracket
        0x300C, // left corner bracket
        0x300E, // left white corner bracket
        0x3010, // left black lenticular bracket
        0x3014, // left tortoise shell bracket
        0xFF08, // fullwidth left parenthesis
        0xFF3B, // fullwidth left square bracket
        0xFF5B, // fullwidth left curly bracket
    ];
    OPENERS.contains(&code)
}
/// Returns `true` when `code` is a closing (right-hand) CJK or fullwidth
/// bracket.
pub fn is_closing_bracket(code: u32) -> bool {
    const CLOSERS: [u32; 9] = [
        0x3009, // right angle bracket
        0x300B, // right double angle bracket
        0x300D, // right corner bracket
        0x300F, // right white corner bracket
        0x3011, // right black lenticular bracket
        0x3015, // right tortoise shell bracket
        0xFF09, // fullwidth right parenthesis
        0xFF3D, // fullwidth right square bracket
        0xFF5D, // fullwidth right curly bracket
    ];
    CLOSERS.contains(&code)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Code points exercised by the tests below.
    const IDEOGRAPHIC_SPACE: u32 = 0x3000;
    const IDEOGRAPHIC_COMMA: u32 = 0x3001;
    const IDEOGRAPHIC_FULL_STOP: u32 = 0x3002;
    const LEFT_ANGLE_BRACKET: u32 = 0x3008;
    const RIGHT_ANGLE_BRACKET: u32 = 0x3009;
    const KATAKANA_MIDDLE_DOT: u32 = 0x30FB;
    const FULLWIDTH_EXCLAMATION_MARK: u32 = 0xFF01;
    const FULLWIDTH_LEFT_PAREN: u32 = 0xFF08;
    const FULLWIDTH_RIGHT_PAREN: u32 = 0xFF09;
    const FULLWIDTH_COMMA: u32 = 0xFF0C;
    const FULLWIDTH_COLON: u32 = 0xFF1A;
    const FULLWIDTH_SEMICOLON: u32 = 0xFF1B;
    const FULLWIDTH_QUESTION_MARK: u32 = 0xFF1F;
    const ASCII_FULL_STOP: u32 = 0x002E;
    const ASCII_COMMA: u32 = 0x002C;

    /// Boundary score for `code` with no density adjustment.
    fn plain_score(code: u32) -> f32 {
        get_cjk_punctuation_boundary_score(code, None)
    }

    /// Boundary score for `code` adjusted by `density`.
    fn dense_score(code: u32, density: TextDensity) -> f32 {
        get_cjk_punctuation_boundary_score(code, Some(density))
    }

    /// Whether `actual` is within `0.01` of `expected`.
    fn is_close(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < 0.01
    }

    // --- Sentence-ending punctuation -------------------------------------

    #[test]
    fn test_ideographic_full_stop() {
        assert!(is_sentence_ending_punctuation(IDEOGRAPHIC_FULL_STOP));
        assert!(is_fullwidth_punctuation(IDEOGRAPHIC_FULL_STOP));
        assert_eq!(plain_score(IDEOGRAPHIC_FULL_STOP), 1.0);
    }

    #[test]
    fn test_fullwidth_question_mark() {
        assert!(is_sentence_ending_punctuation(FULLWIDTH_QUESTION_MARK));
        assert!(is_fullwidth_punctuation(FULLWIDTH_QUESTION_MARK));
        assert_eq!(plain_score(FULLWIDTH_QUESTION_MARK), 1.0);
    }

    #[test]
    fn test_fullwidth_exclamation() {
        assert!(is_sentence_ending_punctuation(FULLWIDTH_EXCLAMATION_MARK));
        assert!(is_fullwidth_punctuation(FULLWIDTH_EXCLAMATION_MARK));
        assert_eq!(plain_score(FULLWIDTH_EXCLAMATION_MARK), 1.0);
    }

    // --- Enumeration punctuation -----------------------------------------

    #[test]
    fn test_ideographic_comma() {
        assert!(is_enumeration_punctuation(IDEOGRAPHIC_COMMA));
        assert!(is_fullwidth_punctuation(IDEOGRAPHIC_COMMA));
        assert_eq!(plain_score(IDEOGRAPHIC_COMMA), 0.9);
    }

    #[test]
    fn test_fullwidth_comma() {
        assert!(is_enumeration_punctuation(FULLWIDTH_COMMA));
        assert!(is_fullwidth_punctuation(FULLWIDTH_COMMA));
        assert_eq!(plain_score(FULLWIDTH_COMMA), 0.9);
    }

    #[test]
    fn test_fullwidth_semicolon() {
        assert!(is_enumeration_punctuation(FULLWIDTH_SEMICOLON));
        assert!(is_fullwidth_punctuation(FULLWIDTH_SEMICOLON));
        assert_eq!(plain_score(FULLWIDTH_SEMICOLON), 0.9);
    }

    #[test]
    fn test_fullwidth_colon() {
        assert!(is_enumeration_punctuation(FULLWIDTH_COLON));
        assert!(is_fullwidth_punctuation(FULLWIDTH_COLON));
        assert_eq!(plain_score(FULLWIDTH_COLON), 0.9);
    }

    // --- Brackets ----------------------------------------------------------

    #[test]
    fn test_fullwidth_parentheses() {
        assert!(is_bracket_punctuation(FULLWIDTH_LEFT_PAREN));
        assert!(is_opening_bracket(FULLWIDTH_LEFT_PAREN));
        assert!(is_bracket_punctuation(FULLWIDTH_RIGHT_PAREN));
        assert!(is_closing_bracket(FULLWIDTH_RIGHT_PAREN));
        assert_eq!(plain_score(FULLWIDTH_LEFT_PAREN), 0.8);
    }

    #[test]
    fn test_angle_brackets() {
        assert!(is_bracket_punctuation(LEFT_ANGLE_BRACKET));
        assert!(is_opening_bracket(LEFT_ANGLE_BRACKET));
        assert!(is_bracket_punctuation(RIGHT_ANGLE_BRACKET));
        assert!(is_closing_bracket(RIGHT_ANGLE_BRACKET));
    }

    // --- Other CJK punctuation --------------------------------------------

    #[test]
    fn test_ideographic_space() {
        assert!(is_other_cjk_punctuation(IDEOGRAPHIC_SPACE));
        assert!(is_fullwidth_punctuation(IDEOGRAPHIC_SPACE));
        assert_eq!(plain_score(IDEOGRAPHIC_SPACE), 0.7);
    }

    #[test]
    fn test_katakana_middle_dot() {
        assert!(is_other_cjk_punctuation(KATAKANA_MIDDLE_DOT));
        assert!(is_fullwidth_punctuation(KATAKANA_MIDDLE_DOT));
    }

    // --- Negative cases and ordering --------------------------------------

    #[test]
    fn test_non_cjk_punctuation() {
        for &ascii in [ASCII_FULL_STOP, ASCII_COMMA].iter() {
            assert!(!is_fullwidth_punctuation(ascii));
            assert_eq!(plain_score(ascii), 0.0);
        }
    }

    #[test]
    fn test_boundary_score_ordering() {
        let sentence_end = plain_score(IDEOGRAPHIC_FULL_STOP);
        let enumeration = plain_score(IDEOGRAPHIC_COMMA);
        let bracket = plain_score(FULLWIDTH_LEFT_PAREN);
        let other = plain_score(KATAKANA_MIDDLE_DOT);

        assert!(sentence_end > enumeration);
        assert!(enumeration > bracket);
        assert!(bracket > other);
    }

    // --- TextDensity::classify --------------------------------------------

    #[test]
    fn test_text_density_classify_low() {
        assert_eq!(TextDensity::classify(400, 1), TextDensity::Low);
    }

    #[test]
    fn test_text_density_classify_medium() {
        assert_eq!(TextDensity::classify(1200, 1), TextDensity::Medium);
    }

    #[test]
    fn test_text_density_classify_high() {
        assert_eq!(TextDensity::classify(3000, 1), TextDensity::High);
    }

    #[test]
    fn test_text_density_classify_boundary_low_medium() {
        let at_threshold = TextDensity::classify(500, 1);
        let above_threshold = TextDensity::classify(501, 1);

        assert_eq!(at_threshold, TextDensity::Low);
        assert_eq!(above_threshold, TextDensity::Medium);
    }

    #[test]
    fn test_text_density_classify_boundary_medium_high() {
        let at_threshold = TextDensity::classify(2000, 1);
        let above_threshold = TextDensity::classify(2001, 1);

        assert_eq!(at_threshold, TextDensity::Medium);
        assert_eq!(above_threshold, TextDensity::High);
    }

    // --- TextDensity::score_multiplier ------------------------------------

    #[test]
    fn test_text_density_score_multiplier_low() {
        assert_eq!(TextDensity::Low.score_multiplier(), 0.6);
    }

    #[test]
    fn test_text_density_score_multiplier_medium() {
        assert_eq!(TextDensity::Medium.score_multiplier(), 1.0);
    }

    #[test]
    fn test_text_density_score_multiplier_high() {
        assert_eq!(TextDensity::High.score_multiplier(), 1.4);
    }

    // --- Density-adjusted scores ------------------------------------------

    #[test]
    fn test_punctuation_score_with_density_low() {
        assert_eq!(get_base_punctuation_score(IDEOGRAPHIC_FULL_STOP), 1.0);

        let scaled = dense_score(IDEOGRAPHIC_FULL_STOP, TextDensity::Low);
        assert!(is_close(scaled, 0.6));
    }

    #[test]
    fn test_punctuation_score_with_density_medium() {
        let unscaled = get_base_punctuation_score(IDEOGRAPHIC_FULL_STOP);
        let scaled = dense_score(IDEOGRAPHIC_FULL_STOP, TextDensity::Medium);
        assert_eq!(scaled, unscaled);
    }

    #[test]
    fn test_punctuation_score_with_density_high() {
        let scaled = dense_score(LEFT_ANGLE_BRACKET, TextDensity::High);
        assert!(is_close(scaled, 1.12));
    }

    #[test]
    fn test_punctuation_score_enumeration_with_density_low() {
        let scaled = dense_score(IDEOGRAPHIC_COMMA, TextDensity::Low);
        assert!(is_close(scaled, 0.54));
    }

    #[test]
    fn test_punctuation_score_enumeration_with_density_high() {
        let scaled = dense_score(IDEOGRAPHIC_COMMA, TextDensity::High);
        assert!(is_close(scaled, 1.26));
    }

    #[test]
    fn test_punctuation_score_without_density() {
        let without_density = plain_score(IDEOGRAPHIC_FULL_STOP);
        let unscaled = get_base_punctuation_score(IDEOGRAPHIC_FULL_STOP);
        assert_eq!(without_density, unscaled);
    }

    // --- classify edge cases ----------------------------------------------

    #[test]
    fn test_density_classify_zero_pages() {
        assert_eq!(TextDensity::classify(1000, 0), TextDensity::Medium);
    }

    #[test]
    fn test_density_classify_multi_page() {
        // (total characters, pages, expected bucket)
        let cases = [
            (3000, 3, TextDensity::Medium), // 1000 characters per page
            (900, 3, TextDensity::Low),     // 300 characters per page
            (9000, 6, TextDensity::Medium), // 1500 characters per page
        ];
        for &(chars, pages, expected) in cases.iter() {
            assert_eq!(TextDensity::classify(chars, pages), expected);
        }
    }

    #[test]
    fn test_bracket_scores_all_densities() {
        // Bracket base score 0.8 scaled by each density multiplier.
        let expectations = [
            (TextDensity::Low, 0.48),
            (TextDensity::Medium, 0.8),
            (TextDensity::High, 1.12),
        ];
        for &(density, expected) in expectations.iter() {
            assert!(is_close(dense_score(FULLWIDTH_LEFT_PAREN, density), expected));
        }
    }
}