tktax-transaction-category 0.2.2

A Rust library for categorizing financial transactions using Porter stemming, CSV-driven classification, and advanced trait-based extensibility.
Documentation
// ---------------- [ File: tktax-transaction-category/src/format.rs ]
#![allow(non_camel_case_types)]
crate::ix!();

/// Selects the format used when categorized transactions are printed.
///
/// `Debug` is derived so values can appear in `assert_eq!`/`assert_ne!`
/// failures and in `{:?}` diagnostics.
///
/// NOTE(review): the concrete layout produced for each variant is defined by
/// the printing code, which is not visible in this file — confirm there.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum CategorizedTransactionsPrintFormat {
    /// Presumably the detailed rendering — verify against the printer.
    Full,
    /// Presumably the condensed rendering — verify against the printer.
    Short,
}

#[cfg(test)]
mod test_category_map_and_prediction {
    use super::*;
    use std::str::FromStr;

    // -------------------------------------------------------------
    // 1. StemmedToken Tests
    // -------------------------------------------------------------

    #[test]
    fn test_stemmed_token_empty() {
        // Parsing an empty string must succeed, and the resulting token is
        // expected to be empty as well.
        let parsed = StemmedToken::from_str("");
        let stemmed = parsed.unwrap();
        assert_eq!(stemmed.as_str(), "");
    }

    #[test]
    fn test_stemmed_token_multiple_words() {
        // Tokens are normally single words. This feeds a hyphenated compound
        // and only verifies that parsing succeeds without producing an empty
        // stem; the exact stem is deliberately not pinned.
        let compound = "businesslike-purchase";
        let stemmed = StemmedToken::from_str(compound).unwrap();
        assert!(!stemmed.as_str().is_empty());
    }

    // -------------------------------------------------------------
    // 2. create_category_map
    // -------------------------------------------------------------
    #[test]
    fn test_create_category_map_empty_csv() {
        // A map obtained from `empty()` must report that it holds no entries.
        let empty_map = CategoryMap::<MockTransactionCategory>::empty();
        let holds_nothing = empty_map.is_empty();
        assert!(holds_nothing);
    }

    #[test]
    fn test_create_category_map_single_line() {
        // One CSV row: category on the left of the comma, description on the right.
        let category_map =
            CategoryMap::<MockTransactionCategory>::from_csv("FoodServices_Groceries,safeway");
        assert_eq!(category_map.len(), 1, "Expected exactly one distinct stem");

        // With a single entry, the first item yielded by the iterator is the
        // one under test.
        let (token, category_set) = category_map.iter().next().unwrap();
        assert!(category_set.contains(&MockTransactionCategory::FoodServices_Groceries));
        // The token "safeway" is expected to be stored unchanged.
        assert_eq!(token.as_str(), "safeway");
    }

    #[test]
    #[should_panic(expected = "unexpected")]
    fn test_create_category_map_invalid_category() {
        // "NotARealCategory" is not a known category name, so building the map
        // is expected to panic with a message containing "unexpected".
        let bad_row = "NotARealCategory,something";
        let _never_built = CategoryMap::<MockTransactionCategory>::from_csv(bad_row);
    }

    #[test]
    fn test_create_category_map_repeated_lines() {
        // The same token appears on two rows under different categories; the
        // map is expected to keep a single entry whose set holds both.
        let rows = "FoodServices_Groceries, safeway\nFoodServices_Deli, safeway\n";
        let category_map = CategoryMap::<MockTransactionCategory>::from_csv(rows);
        assert_eq!(category_map.len(), 1, "Only the token 'safeway'");

        let (_, merged) = category_map.iter().next().unwrap();
        assert_eq!(merged.len(), 2);
        for expected in [
            MockTransactionCategory::FoodServices_Groceries,
            MockTransactionCategory::FoodServices_Deli,
        ] {
            assert!(merged.contains(&expected));
        }
    }

    // -------------------------------------------------------------
    // 3. predict_category
    // -------------------------------------------------------------
    #[test]
    fn test_predict_category_no_match() {
        // With no tokens registered in the map, no word of the description
        // can match, so the prediction list must come back empty.
        // The map is only read here, hence no `mut` binding.
        let cat_map = CategoryMap::<MockTransactionCategory>::empty();
        let result = predict_category("unmatched text", &cat_map);
        assert!(result.is_empty());
    }

    #[test]
    fn test_predict_category_single_match() {
        // Hand-built map: the stem of "safeway" points at exactly one category.
        let groceries = HashSet::from([MockTransactionCategory::FoodServices_Groceries]);
        let mut category_map = CategoryMap::<MockTransactionCategory>::empty();
        category_map.insert(StemmedToken::from_str("safeway").unwrap(), groceries);

        let predictions = predict_category("Went to safeway for groceries", &category_map);
        assert_eq!(predictions.len(), 1);

        let only = &predictions[0];
        assert_eq!(*only.category(), MockTransactionCategory::FoodServices_Groceries);
        // A matched token whose set holds a single category is expected to
        // give that category a score of exactly one.
        assert_eq!(*only.score(), Decimal::ONE);
    }

    #[test]
    fn test_predict_category_multiple_categories_distribution() {
        // The stem "appl" is shared by two categories, so the one matching
        // word in the description is expected to split its weight evenly.
        let shared = HashSet::from([
            MockTransactionCategory::OfficeSupplies,
            MockTransactionCategory::Electronics,
        ]);
        let mut category_map = CategoryMap::<MockTransactionCategory>::empty();
        category_map.insert(StemmedToken::from_str("appl").unwrap(), shared);

        let predictions = predict_category("Got an apple device", &category_map);
        assert_eq!(predictions.len(), 2);

        // Both entries tie at one half, so their relative order is not checked.
        let half = Decimal::ONE / Decimal::from(2);
        assert_eq!(*predictions[0].score(), half);
        assert_eq!(*predictions[1].score(), half);

        // Exactly the two registered categories must be present.
        let seen: HashSet<_> = predictions.iter().map(|p| p.category()).collect();
        assert!(seen.contains(&MockTransactionCategory::OfficeSupplies));
        assert!(seen.contains(&MockTransactionCategory::Electronics));
    }

    #[test]
    fn test_predict_category_score_sorting() {
        // Map under test:
        //   "bank" -> { Financial__Deposit, Financial__Check }
        //   "food" -> { FoodServices_Groceries }
        let mut category_map = CategoryMap::<MockTransactionCategory>::empty();
        category_map.insert(
            StemmedToken::from_str("bank").unwrap(),
            HashSet::from([
                MockTransactionCategory::Financial__Deposit,
                MockTransactionCategory::Financial__Check,
            ]),
        );
        category_map.insert(
            StemmedToken::from_str("food").unwrap(),
            HashSet::from([MockTransactionCategory::FoodServices_Groceries]),
        );

        // Expected scores for the description "bank food":
        //   FoodServices_Groceries => 1.0 (sole category of "food")
        //   Financial__Deposit     => 0.5 (shares "bank")
        //   Financial__Check       => 0.5 (shares "bank")
        // Results are expected in descending score order.
        let predictions = predict_category("bank food", &category_map);
        assert_eq!(predictions.len(), 3);

        // The highest score comes first.
        let leader = &predictions[0];
        assert_eq!(*leader.category(), MockTransactionCategory::FoodServices_Groceries);
        assert_eq!(*leader.score(), Decimal::ONE);

        // The remaining two tie at 0.5, so only membership is checked, not order.
        let half = Decimal::from_f64(0.5).unwrap();
        let runners_up: Vec<_> = predictions[1..]
            .iter()
            .map(|p| (p.category(), p.score()))
            .collect();
        for tied in [
            MockTransactionCategory::Financial__Deposit,
            MockTransactionCategory::Financial__Check,
        ] {
            assert!(runners_up.contains(&(&tied, &half)));
        }
    }

    /// Splits a vendor description into its whitespace-separated words.
    ///
    /// Words are returned verbatim: neither case nor punctuation is altered.
    /// Leading, trailing, and repeated whitespace never produces empty
    /// entries, so an all-whitespace or empty input yields an empty vector.
    ///
    /// NOTE(review): no test visible in this module calls this helper —
    /// confirm it is still needed.
    fn preprocess_vendor_description(input: &str) -> Vec<String> {
        // `split_whitespace` already yields trimmed, non-empty pieces, so no
        // further trimming or filtering is required.
        input.split_whitespace().map(str::to_owned).collect()
    }

    // --------------- Tests for StemmedToken ---------------
    #[test]
    fn test_stemmed_token_punctuation() {
        // Input made up solely of punctuation is expected to reduce to an
        // empty token.
        let punctuation_only = "!!!";
        let stemmed = StemmedToken::from_str(punctuation_only).unwrap();
        assert_eq!(stemmed.as_str(), "", "punctuation-only should be empty");
    }

    #[test]
    fn test_stemmed_token_simple() {
        // A standard Porter stemmer typically reduces "running" to "run". The
        // check is kept loose: it only requires "run" to appear in the stem.
        let stemmed = StemmedToken::from_str("running").unwrap();
        let text = stemmed.as_str();
        assert!(text.contains("run"));
    }

    // --------------- Tests for create_category_map ---------------
    #[test]
    fn test_create_category_map_multiple_categories() {
        // Row layout: two ';'-separated categories left of the single comma,
        // the description "starbuck shop" to its right.
        let row = "FoodServices_Groceries;FoodServices_Deli, starbuck shop";
        let category_map = CategoryMap::<MockTransactionCategory>::from_csv(row);

        // The description yields the tokens "starbuck" and "shop"; each one is
        // expected to get its own entry.
        assert_eq!(category_map.len(), 2, "We expect 2 distinct stem tokens");

        // Every entry must carry both categories from the left-hand side:
        // FoodServices_Groceries and FoodServices_Deli.
        for category_set in category_map.iter().map(|(_, set)| set) {
            assert_eq!(category_set.len(), 2);
            assert!(category_set.contains(&MockTransactionCategory::FoodServices_Groceries));
            assert!(category_set.contains(&MockTransactionCategory::FoodServices_Deli));
        }
    }

    // --------------- Tests for predict_category ---------------
    #[test]
    fn test_predict_category_multiple_tokens_same_category() {
        // Two different tokens ("starbucks" and "coffee") map to the same
        // single-category set. Each recognized token is expected to add 1 to
        // that category, giving a final score of 2.
        let mut cat_map = CategoryMap::<MockTransactionCategory>::empty();

        let mut coffee_set = HashSet::new();
        coffee_set.insert(MockTransactionCategory::FoodServices_Coffee);

        cat_map.insert(
            StemmedToken::from_str("starbucks").unwrap(),
            coffee_set.clone(),
        );
        // Last use of the set: move it instead of cloning a second time.
        cat_map.insert(StemmedToken::from_str("coffee").unwrap(), coffee_set);

        let result = predict_category("Starbucks coffee", &cat_map);
        assert_eq!(result.len(), 1, "Only 1 category in final predictions");

        let top_pred = &result[0];
        assert_eq!(*top_pred.category(), MockTransactionCategory::FoodServices_Coffee);
        assert_eq!(*top_pred.score(), Decimal::from(2));
    }
}