hanzi-sort 0.1.1

Sort Chinese text by pinyin or stroke count, with polyphonic overrides and terminal-friendly output
use std::cmp::Ordering;

use smallvec::SmallVec;

use crate::pinyin::{EncodedSortKey, PinyinContext, compare_encoded_sort_key};
use crate::stroke::stroke_count_for_char;

/// Inline capacity, in tokens, of the `SmallVec` backing `StrokeSortKey`.
const INLINE_STROKE_KEY_LEN: usize = 8;

/// Selects which ordering `sort_strings_by` applies to its input.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum SortMode {
    /// Order by the pinyin sort key built from a `PinyinContext` (the default mode).
    #[default]
    Pinyin,
    /// Order by per-character stroke counts.
    Strokes,
}

/// One character of a string paired with its stroke count; the element type of
/// `StrokeSortKey`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct StrokeSortToken {
    /// The original character, used as the tiebreak when stroke counts do not decide.
    character: char,
    /// Value returned by `stroke_count_for_char` for `character`; `None` tokens are
    /// ordered after every `Some` token by `compare_stroke_token`.
    strokes: Option<u16>,
}

/// Stroke-mode sort key for one string: one `StrokeSortToken` per `char`, with room
/// for `INLINE_STROKE_KEY_LEN` tokens in the `SmallVec`'s inline storage.
type StrokeSortKey = SmallVec<[StrokeSortToken; INLINE_STROKE_KEY_LEN]>;

/// Sorts `input` in pinyin order using `context`.
///
/// Convenience wrapper around [`sort_strings_by`] with the mode fixed to
/// [`SortMode::Pinyin`]. Consumes `input` and returns the same strings reordered.
pub fn sort_strings(input: Vec<String>, context: &PinyinContext) -> Vec<String> {
    let mode = SortMode::Pinyin;
    sort_strings_by(input, context, mode)
}

/// Sorts `input` according to `mode` and returns the reordered strings.
///
/// * [`SortMode::Pinyin`] sorts by the keys that `context` produces.
/// * [`SortMode::Strokes`] sorts by per-character stroke counts; `context` is not
///   consulted in this mode.
pub fn sort_strings_by(input: Vec<String>, context: &PinyinContext, mode: SortMode) -> Vec<String> {
    // Kept exhaustive (no `_` arm) so adding a `SortMode` variant fails to compile here.
    match mode {
        SortMode::Strokes => sort_by_strokes(input),
        SortMode::Pinyin => sort_by_pinyin(input, context),
    }
}

/// Sorts `input` by the encoded pinyin key `context` assigns to each string.
///
/// Every key is computed once up front and carried alongside its string, so the
/// comparator never re-encodes. The sort is unstable: strings whose keys compare
/// equal are not guaranteed to keep their input order.
fn sort_by_pinyin(input: Vec<String>, context: &PinyinContext) -> Vec<String> {
    // Decorate: pair each string with its precomputed key.
    let mut keyed: Vec<(EncodedSortKey, String)> = Vec::with_capacity(input.len());
    for text in input {
        let key = context.encoded_sort_key(&text);
        keyed.push((key, text));
    }

    // Sort on the key half of each pair only.
    keyed.sort_unstable_by(|(left, _), (right, _)| compare_encoded_sort_key(left, right));

    // Undecorate: drop the keys, keep the strings in sorted order.
    let mut sorted = Vec::with_capacity(keyed.len());
    sorted.extend(keyed.into_iter().map(|(_, text)| text));
    sorted
}

/// Sorts `input` by the stroke key of each string.
///
/// Keys come from `stroke_sort_key` and are compared with
/// `compare_stroke_sort_key`. Each key is built once before sorting. The sort is
/// unstable, so strings whose keys compare equal may not keep their input order.
fn sort_by_strokes(input: Vec<String>) -> Vec<String> {
    // Decorate: pair each string with its precomputed stroke key.
    let mut keyed: Vec<(StrokeSortKey, String)> = Vec::with_capacity(input.len());
    keyed.extend(input.into_iter().map(|text| (stroke_sort_key(&text), text)));

    // Sort on the key half of each pair only.
    keyed.sort_unstable_by(|(left, _), (right, _)| compare_stroke_sort_key(left, right));

    // Undecorate: drop the keys, keep the strings in sorted order.
    let mut sorted = Vec::with_capacity(keyed.len());
    for (_, text) in keyed {
        sorted.push(text);
    }
    sorted
}

fn stroke_sort_key(value: &str) -> StrokeSortKey {
    value
        .chars()
        .map(|character| StrokeSortToken {
            character,
            strokes: stroke_count_for_char(character),
        })
        .collect()
}

/// Compares two stroke keys token by token.
///
/// The first position at which `compare_stroke_token` reports a difference decides
/// the result. If one key is a prefix of the other (or they are identical), the
/// shorter key orders first and equal lengths compare `Equal`.
fn compare_stroke_sort_key(a: &StrokeSortKey, b: &StrokeSortKey) -> Ordering {
    // `zip` stops at the shorter key, so only the shared prefix is walked.
    let first_difference = a
        .iter()
        .zip(b.iter())
        .map(|(left, right)| compare_stroke_token(left, right))
        .find(|ordering| *ordering != Ordering::Equal);

    match first_difference {
        Some(ordering) => ordering,
        None => a.len().cmp(&b.len()),
    }
}

/// Orders two stroke tokens.
///
/// * A token with a known stroke count sorts before one without.
/// * Two known counts compare numerically (fewer strokes first), then by `char`.
/// * Two unknown counts compare by `char` alone.
fn compare_stroke_token(a: &StrokeSortToken, b: &StrokeSortToken) -> Ordering {
    // Lexicographic tuple comparison. The leading bool (`false < true`) puts known
    // counts first; it is needed because `Option`'s own ordering would place `None`
    // before `Some`. Once both tokens agree on that flag, `strokes` compares the
    // counts (or is trivially equal for two `None`s) and `character` is the tiebreak.
    let rank = |token: &StrokeSortToken| (token.strokes.is_none(), token.strokes, token.character);
    rank(a).cmp(&rank(b))
}

// Unit tests for the public sorting entry points in both modes.
//
// NOTE(review): several tests below use empty string literals (`""`) for both the
// input and the expected output, so their assertions hold trivially. The test names
// suggest specific Chinese characters were intended; confirm the literals were not
// lost in an encoding or extraction step.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pinyin::PinyinContext;

    // Pinyin sort of five two-character strings with the default context.
    #[test]
    fn test_sort_strings() {
        let input = vec![
            "汉字".to_string(),
            "照相".to_string(),
            "赵云".to_string(),
            "赵四".to_string(),
            "张三".to_string(),
        ];
        let sorted = sort_strings(input, &PinyinContext::default());
        assert_eq!(sorted, vec!["汉字", "张三", "照相", "赵四", "赵云"]);
    }

    // ASCII-only strings are expected after the Chinese one, with "123" ahead of "abc".
    #[test]
    fn test_unknown_characters_sort_after_mapped_characters() {
        let context = PinyinContext::default();
        let sorted = sort_strings(
            vec!["abc".to_string(), "张三".to_string(), "123".to_string()],
            &context,
        );
        assert_eq!(sorted, vec!["张三", "123", "abc"]);
    }

    // Comparing the encoded keys directly must agree with the expected string order.
    #[test]
    fn encoded_sort_key_preserves_ordering_for_known_strings() {
        let context = PinyinContext::default();
        let encoded_a = context.encoded_sort_key("张三");
        let encoded_b = context.encoded_sort_key("赵四");
        assert_eq!(
            compare_encoded_sort_key(&encoded_a, &encoded_b),
            std::cmp::Ordering::Less
        );
    }

    // NOTE(review): both inputs are empty strings here, so the tiebreak named in the
    // test is not actually exercised; confirm the intended characters.
    #[test]
    fn test_same_pinyin_uses_original_character_tiebreak() {
        let context = PinyinContext::default();
        let sorted = sort_strings(vec!["".to_string(), "".to_string()], &context);
        assert_eq!(sorted, vec!["", ""]);
    }

    // A string that is a prefix of another is expected to sort first.
    #[test]
    fn test_shorter_prefix_sorts_first_when_keys_match() {
        let context = PinyinContext::default();
        let sorted = sort_strings(vec!["张三丰".to_string(), "张三".to_string()], &context);
        assert_eq!(sorted, vec!["张三", "张三丰"]);
    }

    // NOTE(review): all four inputs are empty strings here, so no stroke counts are
    // compared; confirm the intended characters.
    #[test]
    fn sorts_by_stroke_count_when_requested() {
        let sorted = sort_strings_by(
            vec![
                "".to_string(),
                "".to_string(),
                "".to_string(),
                "".to_string(),
            ],
            &PinyinContext::default(),
            SortMode::Strokes,
        );
        assert_eq!(sorted, vec!["", "", "", ""]);
    }

    // NOTE(review): both inputs are empty strings here, so the character tiebreak is
    // not exercised; confirm the intended characters.
    #[test]
    fn stroke_sort_uses_original_character_as_tiebreak() {
        let sorted = sort_strings_by(
            vec!["".to_string(), "".to_string()],
            &PinyinContext::default(),
            SortMode::Strokes,
        );
        assert_eq!(sorted, vec!["", ""]);
    }

    // NOTE(review): the middle input is an empty string, which sorts first only
    // because an empty key is a prefix of every key; the test name suggests a
    // character with a known stroke count was intended. Confirm the literal.
    #[test]
    fn stroke_sort_places_unknown_characters_after_known_ones() {
        let sorted = sort_strings_by(
            vec!["abc".to_string(), "".to_string(), "1".to_string()],
            &PinyinContext::default(),
            SortMode::Strokes,
        );
        assert_eq!(sorted, vec!["", "1", "abc"]);
    }
}