onoma 0.0.20

A fast, language-agnostic semantic symbol indexer and typo-resistant fuzzy finder, enabling real-time search across virtually unlimited code symbols without the need for language servers.
Documentation
use crate::{
    models::{
        self,
        parsed::{Language, SymbolKind},
    },
    resolver::constant::{self, DEFAULT_SCORE},
};

/// 8-point bonus during fuzzy matching when the fuzzy match is case-sensitive (i.e. the query
/// includes upper case characters), and a capital letter follows a lowercase letter.
///
/// I.e. matching a capital letter after a lowercase letter (e.g. "b" on "fooBar" will receive a
/// bonus on "B").
pub const CASE_SENSITIVE_MATCHING_CAPITALISATION_BONUS: u16 = 8;

/// 4-point bonus during fuzzy matching when the fuzzy match is case-sensitive (i.e. the query
/// includes upper case characters), and the case of a query character and a symbol character
/// match.
///
/// I.e. matching the case of the needle (e.g. "WorLd" on "WoRld" will receive a bonus on "W",
/// "o", "d").
pub const CASE_SENSITIVE_MATCHING_CASE_BONUS: u16 = 4;

/// 3.5% bonus (relative to [`constant::DEFAULT_SCORE`]) for symbols whose kind matches the clear
/// intent shown by a query.
///
/// For example, all uppercase queries are most commonly looking for constants. See
/// [`calculate_clear_intent_bonus`] for the heuristic which awards this bonus.
pub const CLEAR_QUERY_INTENT_SYMBOL_KINDS_SCORE_BONUS: i64 = (constant::DEFAULT_SCORE * 35) / 1000;

/// 3.5% bonus (relative to [`constant::DEFAULT_SCORE`]) for common symbol kinds.
pub const COMMON_SYMBOL_KINDS_SCORE_BONUS: i64 = (constant::DEFAULT_SCORE * 35) / 1000;

/// 1.5% bonus (relative to [`constant::DEFAULT_SCORE`]) for infrequent symbol kinds.
pub const INFREQUENT_SYMBOL_KINDS_SCORE_BONUS: i64 = (constant::DEFAULT_SCORE * 15) / 1000;

/// 1.5% penalty (relative to [`constant::DEFAULT_SCORE`]) for uncommon symbol kinds.
///
/// The value itself is negative, so it can be added directly to a score.
pub const UNCOMMON_SYMBOL_KINDS_SCORE_PENALTY: i64 = -((constant::DEFAULT_SCORE * 15) / 1000);

/// 1% penalty (relative to [`constant::DEFAULT_SCORE`]) for symbols which are part of a test
/// harness (i.e. it's likely a test case, part of a test file, etc.).
pub const TEST_HARNESS_SCORE_PENALTY: i64 = -((constant::DEFAULT_SCORE * 10) / 1000);

/// 0.25% penalty (relative to [`constant::DEFAULT_SCORE`]) for symbols defined in an entrypoint -
/// this helps to filter out re-exports.
pub const ENTRYPOINT_FILE_SCORE_PENALTY: i64 = -((constant::DEFAULT_SCORE * 25) / 10000);

/// 5% penalty (relative to [`constant::DEFAULT_SCORE`]) for code generated by tools like GraphQL
/// codegen, Protobuf, etc.
pub const AUTOGENERATED_CODE_SCORE_PENALTY: i64 = -((constant::DEFAULT_SCORE * 50) / 1000);

/// 1% penalty (relative to [`constant::DEFAULT_SCORE`]) for symbols defined in the same file as
/// the one currently focussed.
///
/// There's an interesting usability question here: the idea is that it's likely that the
/// intent of a workspace-wide search would be to find symbols which are within close proximity
/// ([`calculate_distance_score_penalty`]) but also not in the same file (i.e. since you could
/// just navigate to that symbol in line). However, how true is this in practice?
pub const SAME_FILE_SCORE_PENALTY: i64 = -((constant::DEFAULT_SCORE * 10) / 1000);

/// 4.5% penalty (relative to [`constant::DEFAULT_SCORE`]) for constants defined in JavaScript or
/// TypeScript (including JSX/TSX) whose names are camel case.
///
/// By convention, constants are used as variables (i.e. `const x = 1;`) in functions, methods, and
/// other constructs. These types of constants are not all that important for searching - global
/// constants defined in all capitals are though (i.e. `const MAX_RETRIES = 10;`).
///
/// Applied by [`calculate_camel_case_constant_penalty`]. The value itself is negative, so it can
/// be added directly to a score.
///
/// See: <https://www.w3schools.com/js/js_const.asp>
pub const JS_TS_CAMEL_CASE_CONSTANT_PENALTY: i64 = -((constant::DEFAULT_SCORE * 45) / 1000);

/// Penalty for each directory separating a symbol from the currently focused file.
///
/// Every directory of distance costs 0.2% of [`constant::DEFAULT_SCORE`], and the distance is
/// capped at 6 directories - so the penalty never exceeds 1.2% of the default score.
///
/// A `distance` of `0` yields no penalty. The returned value is zero or negative, so it can be
/// added directly to a score.
pub fn calculate_distance_score_penalty(distance: usize) -> i64 {
    const MAX_DISTANCE: i64 = 6;

    // A distance which does not fit in an `i64` is necessarily beyond the cap, so it is treated
    // exactly like the cap itself.
    let capped_distance =
        i64::try_from(distance).map_or(MAX_DISTANCE, |steps| steps.min(MAX_DISTANCE));

    // A distance of zero falls out naturally here as a penalty of zero.
    -((constant::DEFAULT_SCORE * (capped_distance * 2)) / 1000)
}

/// Convert a fuzzy match score into a score bonus, favouring exact matches.
///
/// Non-exact matches earn a quarter of their fuzzy score, capped at 5% of [`DEFAULT_SCORE`].
/// Exact matches earn double that quarter, capped at 6.5% of [`DEFAULT_SCORE`].
///
/// The numbers are broadly arbitrary, but they are chosen so that exact (and similar) matches
/// rank above those which are only loosely matched, without letting generic queries inflate
/// symbols arbitrarily.
pub fn calculate_fuzzy_match_bonus(fuzzy_match: frizbee::Match) -> i64 {
    // Integer division: the quarter is truncated before any doubling takes place.
    let quarter_of_score = i64::from(fuzzy_match.score) / 4;

    if fuzzy_match.exact {
        // Exact matches may not exceed 6.5% of the default score
        let exact_match_cap = (DEFAULT_SCORE * 65) / 1000;

        (quarter_of_score * 2).min(exact_match_cap)
    } else {
        // Non-exact matches may not exceed 5% of the default score, which prevents arbitrary
        // inflation of symbols from generic queries
        let loose_match_cap = (DEFAULT_SCORE * 50) / 1000;

        quarter_of_score.min(loose_match_cap)
    }
}

/// A small penalty for constants defined in JavaScript and TypeScript which are camel case
/// (i.e. `const x = 1; const someVar = "hello";`).
///
/// By convention, constants are used as variables in functions, methods, and other constructs. These
/// types of constants are not all that important for searching - global constants defined in all
/// capitals are though (i.e. `const MAX_RETRIES = 10;`).
///
/// Returns [`JS_TS_CAMEL_CASE_CONSTANT_PENALTY`] when `symbol` is a constant in JavaScript or
/// TypeScript (including JSX/TSX) whose name starts with a lowercase character, and `0`
/// otherwise.
///
/// Only the first character is inspected: both SCREAMING_CASE names (usually globals) and
/// PascalCase names (usually constructs - i.e. components, etc.) start with a non-lowercase
/// character, so they are never penalised. The same holds for empty names, and for names which
/// start with a non-letter such as `_` or `$`.
///
/// See: <https://www.w3schools.com/js/js_const.asp>
pub fn calculate_camel_case_constant_penalty(symbol: &models::resolved::ResolvedSymbol) -> i64 {
    let is_javascript_or_typescript = matches!(
        symbol.language,
        Language::TypeScript
            | Language::TypeScriptJsx
            | Language::Javascript
            | Language::JavascriptJsx
    );

    let starts_with_lowercase = symbol.name.chars().next().is_some_and(char::is_lowercase);

    if is_javascript_or_typescript && symbol.kind == SymbolKind::Constant && starts_with_lowercase
    {
        // It's a camel case constant, so likely being used as a variable
        JS_TS_CAMEL_CASE_CONSTANT_PENALTY
    } else {
        0
    }
}

/// Apply a bonus to symbols who's [`models::resolved::SymbolKind`] matches the intent
/// displayed by a query string.
///
/// This is largely a heuristic using common style conventions - i.e. constants in all caps,
/// functions in snake or pascal case, etc.
pub fn calculate_clear_intent_bonus(query: &str, symbol: &models::resolved::ResolvedSymbol) -> i64 {
    let has_uppercase = query.chars().any(char::is_uppercase);
    let has_lowercase = query.chars().any(char::is_lowercase);
    let has_underscores = query.chars().any(|c| c == '_');

    let is_length_for_clear_intent =
        query.len() >= constant::MIN_CLEAR_INTENT_QUERY_LENGTH as usize;
    let is_upper_and_lower_mix = has_uppercase && has_lowercase && !has_underscores;
    let is_snake_case = has_underscores && !has_uppercase;
    let is_screaming_case = has_uppercase && !has_lowercase;

    // Bonus for symbols where a particular query indicates clear intent to that symbol
    // kind. For example, queries in all uppercase prioritise constants, and queries which
    // could be pascal case prioritise structs/classes/etc.
    match symbol.kind {
        // i.e. `SOME_CONSTANT` or `WEIGHT`
        models::parsed::SymbolKind::Constant
        | models::parsed::SymbolKind::StaticField
        | models::parsed::SymbolKind::StaticVariable
        | models::parsed::SymbolKind::StaticDataMember
            if is_length_for_clear_intent && is_screaming_case =>
        {
            CLEAR_QUERY_INTENT_SYMBOL_KINDS_SCORE_BONUS
        }

        // i.e. `SpecialClass` or `NewClassInterface`
        models::parsed::SymbolKind::Struct
        | models::parsed::SymbolKind::Type
        | models::parsed::SymbolKind::TypeAlias
        | models::parsed::SymbolKind::Class
        | models::parsed::SymbolKind::Enum
        | models::parsed::SymbolKind::EnumMember
        | models::parsed::SymbolKind::Interface
        | models::parsed::SymbolKind::Trait
        | models::parsed::SymbolKind::Protocol
        | models::parsed::SymbolKind::Union
        | models::parsed::SymbolKind::Variable // Components as Arrow functions in TypeScript
            if is_length_for_clear_intent && is_upper_and_lower_mix =>
        {
            CLEAR_QUERY_INTENT_SYMBOL_KINDS_SCORE_BONUS
        }

        // i.e. `is_ready` or `isReady` or `IsReady`
        models::parsed::SymbolKind::Function
        | models::parsed::SymbolKind::Method
        | models::parsed::SymbolKind::Predicate
        | models::parsed::SymbolKind::TraitMethod
        | models::parsed::SymbolKind::ProtocolMethod
        | models::parsed::SymbolKind::AbstractMethod
        | models::parsed::SymbolKind::Getter
            if is_length_for_clear_intent
                && (is_snake_case || is_upper_and_lower_mix ) =>
        {
            CLEAR_QUERY_INTENT_SYMBOL_KINDS_SCORE_BONUS
        }

        // No bonus for anything without clear intent
        _ => 0,
    }
}

#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use rstest::rstest;

    use crate::models::{
        parsed::{Language, SymbolKind},
        resolved::{ResolvedSymbol, Score},
    };

    /// Build a symbol with the given name, kind and language, leaving every other field at a
    /// neutral value (empty path, default score, zeroed id and positions).
    fn symbol_fixture(name: &str, kind: SymbolKind, language: Language) -> ResolvedSymbol {
        ResolvedSymbol {
            id: 0,
            name: name.to_string(),
            kind,
            language,
            path: PathBuf::new(),
            score: Score::default(),
            start_line: 0,
            end_line: 0,
            start_column: 0,
            end_column: 0,
        }
    }

    // The penalty grows by 2 points per directory, and stops growing after 6 directories.
    #[rstest]
    #[case(0, 0)]
    #[case(1, -2)]
    #[case(2, -4)]
    #[case(3, -6)]
    #[case(4, -8)]
    #[case(5, -10)]
    #[case(6, -12)]
    #[case(7, -12)]
    #[case(8, -12)]
    #[case(9, -12)]
    #[case(10, -12)]
    pub fn test_distance_weighting(#[case] distance: usize, #[case] expected_penalty: i64) {
        let actual_penalty = super::calculate_distance_score_penalty(distance);

        assert_eq!(expected_penalty, actual_penalty);
    }

    // Only camel case constants in JavaScript/TypeScript (and their JSX flavours) are penalised.
    #[rstest]
    #[case("x", SymbolKind::Constant, Language::TypeScript, -45)]
    #[case("someVar", SymbolKind::Constant, Language::TypeScript, -45)]
    #[case("MAX_RETRIES", SymbolKind::Constant, Language::TypeScript, 0)]
    #[case("Vehicle", SymbolKind::Constant, Language::TypeScript, 0)]
    #[case("CakeMixer", SymbolKind::Constant, Language::TypeScript, 0)]
    #[case("batter", SymbolKind::Constant, Language::Javascript, -45)]
    #[case("latte", SymbolKind::Constant, Language::JavascriptJsx, -45)]
    #[case("MAX_ATTEMPTS", SymbolKind::Constant, Language::TypeScriptJsx, 0)]
    #[case("Vehicle", SymbolKind::Constant, Language::TypeScriptJsx, 0)]
    #[case("CakeMixer", SymbolKind::Constant, Language::JavascriptJsx, 0)]
    #[case("somethingSpecial", SymbolKind::Constant, Language::Rust, 0)]
    #[case("y", SymbolKind::Constant, Language::Lua, 0)]
    #[case("someFunc", SymbolKind::Function, Language::TypeScriptJsx, 0)]
    pub fn test_camel_case_const_penalty(
        #[case] name: &str,
        #[case] kind: SymbolKind,
        #[case] language: Language,
        #[case] expected_penalty: i64,
    ) {
        let symbol = symbol_fixture(name, kind, language);
        let actual_penalty = super::calculate_camel_case_constant_penalty(&symbol);

        assert_eq!(expected_penalty, actual_penalty);
    }
}