pub mod japanese_iteration_mark;
pub mod mapping;
pub mod regex;
pub mod unicode_normalize;
use std::ops::Deref;
use serde_json::Value;
use crate::LinderaResult;
use crate::character_filter::japanese_iteration_mark::{
JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME, JapaneseIterationMarkCharacterFilter,
};
use crate::character_filter::mapping::{MAPPING_CHARACTER_FILTER_NAME, MappingCharacterFilter};
use crate::character_filter::regex::{REGEX_CHARACTER_FILTER_NAME, RegexCharacterFilter};
use crate::character_filter::unicode_normalize::{
UNICODE_NORMALIZE_CHARACTER_FILTER_NAME, UnicodeNormalizeCharacterFilter,
};
use crate::error::LinderaErrorKind;
use crate::parse_cli_flag;
/// One replacement performed by a character filter.
///
/// A transformation pairs the span that was replaced in the original text
/// (`original_start..original_end`) with the span that took its place in the
/// filtered text (`filtered_start..filtered_end`). Span lengths are computed
/// as `end - start`.
#[derive(Debug, Clone, PartialEq)]
pub struct Transformation {
    /// Start of the replaced span in the original text.
    pub original_start: usize,
    /// End of the replaced span in the original text.
    pub original_end: usize,
    /// Start of the replacement span in the filtered text.
    pub filtered_start: usize,
    /// End of the replacement span in the filtered text.
    pub filtered_end: usize,
}

impl Transformation {
    /// Creates a transformation from the original span and the filtered span.
    pub fn new(
        original_start: usize,
        original_end: usize,
        filtered_start: usize,
        filtered_end: usize,
    ) -> Self {
        Self {
            original_start,
            original_end,
            filtered_start,
            filtered_end,
        }
    }

    /// Length of the original span; an inverted span (end before start)
    /// counts as empty instead of underflowing.
    fn original_len(&self) -> usize {
        self.original_end.saturating_sub(self.original_start)
    }

    /// Length of the filtered span; an inverted span (end before start)
    /// counts as empty instead of underflowing.
    fn filtered_len(&self) -> usize {
        self.filtered_end.saturating_sub(self.filtered_start)
    }

    /// Signed amount by which this replacement displaces every position that
    /// follows it: positive when the original span was longer than its
    /// replacement, negative when the replacement is longer.
    fn length_delta(&self) -> i64 {
        self.original_len() as i64 - self.filtered_len() as i64
    }
}

/// A list of [`Transformation`]s used to translate offsets in a filtered text
/// back to offsets in the original text.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct OffsetMapping {
    /// Recorded transformations, in insertion order.
    pub transformations: Vec<Transformation>,
}

impl OffsetMapping {
    /// Creates a mapping without any transformation.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a mapping from an existing list of transformations.
    pub fn with_transformations(transformations: Vec<Transformation>) -> Self {
        Self { transformations }
    }

    /// Appends a transformation to the mapping.
    pub fn add_transformation(&mut self, transformation: Transformation) {
        self.transformations.push(transformation);
    }

    /// Returns `true` when no transformation has been recorded.
    pub fn is_empty(&self) -> bool {
        self.transformations.is_empty()
    }

    /// Translates `offset`, measured in the filtered text of length
    /// `text_len`, into an offset in the original text.
    ///
    /// * With no transformations, `offset` is returned unchanged.
    /// * Otherwise `offset` is clamped to `text_len` and the transformations
    ///   are visited in stored order; the first one that the offset precedes
    ///   or falls inside decides the result:
    ///   * before it: the offset is shifted by the net length change of every
    ///     transformation whose `filtered_start` is smaller;
    ///   * inside it: the position is interpolated linearly over the original
    ///     span and rounded; if either span is empty the result is
    ///     `original_start`.
    /// * If the offset lies after every transformation, it is shifted by the
    ///   net length change of all of them, and whatever part of `offset`
    ///   exceeded `text_len` is added back.
    ///
    /// A shift that would move the offset below zero yields `0` rather than
    /// wrapping around to a huge value.
    pub fn correct_offset(&self, offset: usize, text_len: usize) -> usize {
        if self.transformations.is_empty() {
            return offset;
        }

        let clamped_offset = offset.min(text_len);

        for transformation in &self.transformations {
            if clamped_offset < transformation.filtered_start {
                // Only replacements that start earlier in the filtered text
                // displace a position located before this one.
                let delta: i64 = self
                    .transformations
                    .iter()
                    .filter(|prev| prev.filtered_start < transformation.filtered_start)
                    .map(Transformation::length_delta)
                    .sum();
                return Self::shift_offset(clamped_offset, delta);
            }

            if clamped_offset <= transformation.filtered_end {
                let original_len = transformation.original_len();
                let filtered_len = transformation.filtered_len();
                // A pure insertion or deletion has nothing to interpolate
                // over, so it collapses onto the start of the original span.
                if original_len == 0 || filtered_len == 0 {
                    return transformation.original_start;
                }

                let filtered_offset = clamped_offset - transformation.filtered_start;
                let ratio = filtered_offset as f64 / filtered_len as f64;
                let original_offset = (ratio * original_len as f64).round() as usize;
                return transformation.original_start + original_offset;
            }
        }

        let delta: i64 = self
            .transformations
            .iter()
            .map(Transformation::length_delta)
            .sum();
        // `offset - clamped_offset` is the overshoot past `text_len` (zero
        // when the offset was in range).
        Self::shift_offset(clamped_offset, delta) + (offset - clamped_offset)
    }

    /// Concatenates the transformations of `self` and `other`.
    ///
    /// If either mapping is empty the other one is returned as is.
    /// NOTE(review): `other`'s spans are appended verbatim; they are not
    /// re-expressed relative to `self`.
    pub fn compose(mut self, other: OffsetMapping) -> OffsetMapping {
        if self.transformations.is_empty() {
            return other;
        }
        self.transformations.extend(other.transformations);
        self
    }

    /// Applies the signed `delta` to `offset`, saturating at zero.
    fn shift_offset(offset: usize, delta: i64) -> usize {
        (offset as i64).saturating_add(delta).max(0) as usize
    }
}
/// Interface implemented by every character filter.
///
/// Implementors must be `Send + Sync + 'static` and provide
/// [`CharacterFilterClone`] so that a filter can be duplicated behind a box.
pub trait CharacterFilter: CharacterFilterClone + Send + Sync + 'static {
    /// Returns the name of this filter.
    fn name(&self) -> &str;

    /// Runs the filter on `text`, which it may modify in place.
    ///
    /// On success returns an [`OffsetMapping`]; presumably it describes the
    /// edits made to `text` so offsets can be mapped back — confirm against
    /// the concrete filters.
    fn apply(&self, text: &mut String) -> LinderaResult<OffsetMapping>;
}
/// Owning, type-erased wrapper around any [`CharacterFilter`] implementation.
pub struct BoxCharacterFilter(Box<dyn CharacterFilter + 'static + Send + Sync>);

/// Lets a `BoxCharacterFilter` be used wherever a `&dyn CharacterFilter` is
/// expected.
impl Deref for BoxCharacterFilter {
    type Target = dyn CharacterFilter;

    fn deref(&self) -> &dyn CharacterFilter {
        let Self(inner) = self;
        &**inner
    }
}

/// Boxes any concrete filter.
impl<T> From<T> for BoxCharacterFilter
where
    T: CharacterFilter,
{
    fn from(character_filter: T) -> Self {
        Self(Box::new(character_filter))
    }
}
/// Cloning support for boxed character filters.
pub trait CharacterFilterClone {
    /// Returns a clone of `self` wrapped in a new [`BoxCharacterFilter`].
    fn box_clone(&self) -> BoxCharacterFilter;
}

/// Every cloneable filter gets `box_clone` for free.
impl<T> CharacterFilterClone for T
where
    T: CharacterFilter + Clone + 'static,
{
    fn box_clone(&self) -> BoxCharacterFilter {
        let duplicate: T = self.clone();
        duplicate.into()
    }
}
/// Namespace for functions that build a boxed character filter from
/// configuration.
pub struct CharacterFilterLoader {}

impl CharacterFilterLoader {
    /// Builds the filter registered under `kind`, configured by `value`.
    ///
    /// `kind` is compared against the `*_CHARACTER_FILTER_NAME` constants and
    /// `value` is handed to the matching filter's `from_config`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `from_config`, and returns a
    /// `LinderaErrorKind::Deserialize` error when `kind` matches no known
    /// filter.
    pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxCharacterFilter> {
        Ok(match kind {
            JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME => {
                let filter = JapaneseIterationMarkCharacterFilter::from_config(value)?;
                BoxCharacterFilter::from(filter)
            }
            MAPPING_CHARACTER_FILTER_NAME => {
                let filter = MappingCharacterFilter::from_config(value)?;
                BoxCharacterFilter::from(filter)
            }
            REGEX_CHARACTER_FILTER_NAME => {
                let filter = RegexCharacterFilter::from_config(value)?;
                BoxCharacterFilter::from(filter)
            }
            UNICODE_NORMALIZE_CHARACTER_FILTER_NAME => {
                let filter = UnicodeNormalizeCharacterFilter::from_config(value)?;
                BoxCharacterFilter::from(filter)
            }
            _ => {
                let cause = anyhow::anyhow!("unsupported character filter: {kind}");
                return Err(LinderaErrorKind::Deserialize.with_error(cause));
            }
        })
    }

    /// Builds a filter from a command-line flag.
    ///
    /// The flag is split by `parse_cli_flag` into a kind and its arguments,
    /// which are then passed to [`Self::load_from_value`].
    ///
    /// # Errors
    ///
    /// Propagates errors from `parse_cli_flag` and from
    /// [`Self::load_from_value`].
    pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxCharacterFilter> {
        let (kind, args) = parse_cli_flag(cli_flag)?;
        Self::load_from_value(kind, &args)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Transformation::new` stores each argument in the matching field.
    #[test]
    fn test_transformation() {
        let t = Transformation::new(0, 3, 0, 1);

        assert_eq!(t.original_start, 0);
        assert_eq!(t.original_end, 3);
        assert_eq!(t.filtered_start, 0);
        assert_eq!(t.filtered_end, 1);
    }

    /// An empty mapping leaves offsets untouched.
    #[test]
    fn test_offset_mapping_empty() {
        let empty = OffsetMapping::new();

        assert!(empty.is_empty());
        for offset in [5, 0] {
            assert_eq!(offset, empty.correct_offset(offset, 10));
        }
    }

    /// A span of length 3 replaced by a span of length 1 shifts later
    /// offsets by 2.
    #[test]
    fn test_offset_mapping_with_transformation() {
        let mut mapping = OffsetMapping::new();
        mapping.add_transformation(Transformation::new(0, 3, 0, 1));
        assert!(!mapping.is_empty());

        // (offset in filtered text, expected offset in original text)
        let cases = [(0, 0), (1, 3), (3, 5)];
        for (filtered, expected) in cases {
            assert_eq!(expected, mapping.correct_offset(filtered, 8));
        }
    }

    /// Composing two non-empty mappings keeps the transformations of both.
    #[test]
    fn test_offset_mapping_compose() {
        let mut first = OffsetMapping::new();
        first.add_transformation(Transformation::new(0, 3, 0, 1));

        let mut second = OffsetMapping::new();
        second.add_transformation(Transformation::new(1, 2, 1, 4));

        let composed = first.compose(second);
        assert_eq!(composed.transformations.len(), 2);
    }
}