use std::collections::HashMap;
/// Role assigned to an identifier that is classified as a local variable.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum VarType {
    /// Index/iteration style names such as `i`, `idx` or `iter`.
    Iterator,
    /// Names containing `count`, `num` or `total`.
    Counter,
    /// Scratch names such as `tmp`, `result` or `val`.
    Temporary,
    /// Names containing `config`, `setting`, `option` or `param`.
    Configuration,
    /// Names containing `file`, `conn`, `client`, `socket`, `stream` or `handle`.
    Resource,
    /// Names containing `data`, `value`, `item`, `element`, `node` or `entry`.
    Data,
    /// Any identifier that matches none of the naming rules.
    Other,
}
/// Kind of field access carried by `TokenClass::FieldAccess`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum AccessType {
    Getter,
    Setter,
    Chained,
    Collection,
}
/// Kind of method call carried by `TokenClass::MethodCall`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CallType {
    /// Names starting with `get_` or `as_`, or ending in `_ref`.
    Getter,
    /// Names starting with `set_` or `with_`.
    Setter,
    /// Names starting with `is_`, `has_`, `can_` or `should_`.
    Validator,
    /// Names starting with `to_`, `into_` or `from_`, plus `parse`.
    Converter,
    /// I/O verbs such as `read`, `write`, `send`, `open` or `close`.
    IO,
    /// Error-handling combinators such as `unwrap`, `expect` or `map_err`.
    ErrorHandle,
    /// Collection operations such as `push`, `len`, `iter` or `collect`.
    Collection,
    /// An otherwise unrecognised call made in an external context.
    External,
    /// Any other method call.
    Other,
}
/// Control-flow construct carried by `TokenClass::ControlFlow`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FlowType {
    /// Conditional branching; also used for `else` and `elif` tokens.
    If,
    /// The `match` keyword.
    Match,
    /// The `loop` keyword.
    Loop,
    /// The `while` keyword.
    While,
    /// The `for` keyword.
    For,
    /// The `return` keyword.
    Return,
    /// The `break` keyword.
    Break,
    /// The `continue` keyword.
    Continue,
}
/// Error-handling construct carried by `TokenClass::ErrorHandling`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ErrorType {
    Result,
    Option,
    Unwrap,
    Expect,
    QuestionMark,
    MapErr,
    AndThen,
    OrElse,
}
/// Collection operation carried by `TokenClass::Collection`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CollectionOp {
    Iteration,
    Mapping,
    Filtering,
    Aggregation,
    Access,
    Mutation,
}
/// Category of literal carried by `TokenClass::Literal`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LiteralCategory {
    /// A token that parses as a floating-point number.
    Numeric,
    /// A token wrapped in double quotes.
    String,
    /// `true` or `false`.
    Boolean,
    /// A token wrapped in single quotes.
    Char,
    /// `null`, `None` or `nil`.
    Null,
}
/// Classification assigned to a single token.
///
/// The type is hashable so it can be used as a key in the weight table.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenClass {
    /// An identifier treated as a local variable.
    LocalVar(VarType),
    /// A token seen in a field-access context.
    FieldAccess(AccessType),
    /// A token seen in a method-call context.
    MethodCall(CallType),
    /// An external API reference, carrying its name.
    ExternalAPI(String),
    /// A control-flow keyword.
    ControlFlow(FlowType),
    /// An error-handling construct.
    ErrorHandling(ErrorType),
    /// A collection operation.
    Collection(CollectionOp),
    /// A literal value.
    Literal(LiteralCategory),
    /// A language keyword, carrying the raw token text.
    Keyword(String),
    /// An operator, carrying the raw token text.
    Operator(String),
    /// A token no rule recognised, carrying the raw token text.
    Unknown(String),
}
/// Syntactic context supplied alongside a token when classifying it.
#[derive(Debug, Clone)]
pub struct TokenContext {
    /// The token is the name of a method being called.
    pub is_method_call: bool,
    /// The token is a field being accessed.
    pub is_field_access: bool,
    /// The token belongs to external code; this disables local-variable
    /// classification and marks unrecognised method calls as external.
    pub is_external: bool,
    /// Scope depth recorded for the token (not read by the classifier in this file).
    pub scope_depth: usize,
    /// Type of the parent syntax node (not read by the classifier in this file).
    pub parent_node_type: NodeType,
}
/// Kind of syntax node recorded as a token's parent in `TokenContext`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeType {
    Function,
    Method,
    Closure,
    Block,
    Expression,
    Statement,
    Pattern,
    Type,
}
/// Settings that control a `TokenClassifier`.
#[derive(Debug, Clone)]
pub struct ClassificationConfig {
    /// When `false`, every token is reported as `TokenClass::Unknown`.
    pub enabled: bool,
    /// Weight for each token class; classes missing from the map weigh 0.5.
    pub weights: HashMap<TokenClass, f64>,
    /// Maximum number of classification results kept in the cache.
    pub cache_size: usize,
}
impl Default for ClassificationConfig {
fn default() -> Self {
let mut weights = HashMap::new();
weights.insert(TokenClass::LocalVar(VarType::Iterator), 0.1);
weights.insert(TokenClass::LocalVar(VarType::Counter), 0.2);
weights.insert(TokenClass::LocalVar(VarType::Temporary), 0.3);
weights.insert(TokenClass::LocalVar(VarType::Configuration), 0.5);
weights.insert(TokenClass::LocalVar(VarType::Resource), 0.7);
weights.insert(TokenClass::LocalVar(VarType::Data), 0.5);
weights.insert(TokenClass::LocalVar(VarType::Other), 0.4);
weights.insert(TokenClass::FieldAccess(AccessType::Getter), 0.3);
weights.insert(TokenClass::FieldAccess(AccessType::Setter), 0.4);
weights.insert(TokenClass::FieldAccess(AccessType::Chained), 0.6);
weights.insert(TokenClass::FieldAccess(AccessType::Collection), 0.5);
weights.insert(TokenClass::MethodCall(CallType::Getter), 0.2);
weights.insert(TokenClass::MethodCall(CallType::Setter), 0.3);
weights.insert(TokenClass::MethodCall(CallType::Validator), 0.4);
weights.insert(TokenClass::MethodCall(CallType::Converter), 0.5);
weights.insert(TokenClass::MethodCall(CallType::IO), 0.9);
weights.insert(TokenClass::MethodCall(CallType::ErrorHandle), 0.7);
weights.insert(TokenClass::MethodCall(CallType::Collection), 0.4);
weights.insert(TokenClass::MethodCall(CallType::External), 1.0);
weights.insert(TokenClass::MethodCall(CallType::Other), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::If), 0.5);
weights.insert(TokenClass::ControlFlow(FlowType::Match), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::Loop), 0.7);
weights.insert(TokenClass::ControlFlow(FlowType::While), 0.7);
weights.insert(TokenClass::ControlFlow(FlowType::For), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::Return), 0.3);
weights.insert(TokenClass::ControlFlow(FlowType::Break), 0.4);
weights.insert(TokenClass::ControlFlow(FlowType::Continue), 0.4);
weights.insert(TokenClass::ErrorHandling(ErrorType::Result), 0.6);
weights.insert(TokenClass::ErrorHandling(ErrorType::Option), 0.5);
weights.insert(TokenClass::ErrorHandling(ErrorType::Unwrap), 0.8);
weights.insert(TokenClass::ErrorHandling(ErrorType::Expect), 0.8);
weights.insert(TokenClass::ErrorHandling(ErrorType::QuestionMark), 0.4);
weights.insert(TokenClass::ErrorHandling(ErrorType::MapErr), 0.6);
weights.insert(TokenClass::ErrorHandling(ErrorType::AndThen), 0.5);
weights.insert(TokenClass::ErrorHandling(ErrorType::OrElse), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Iteration), 0.3);
weights.insert(TokenClass::Collection(CollectionOp::Mapping), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Filtering), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Aggregation), 0.7);
weights.insert(TokenClass::Collection(CollectionOp::Access), 0.4);
weights.insert(TokenClass::Collection(CollectionOp::Mutation), 0.6);
weights.insert(TokenClass::Literal(LiteralCategory::Numeric), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::String), 0.2);
weights.insert(TokenClass::Literal(LiteralCategory::Boolean), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::Char), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::Null), 0.1);
Self {
enabled: false, weights,
cache_size: 10000,
}
}
}
/// Classifies tokens heuristically and remembers earlier results.
#[derive(Debug)]
pub struct TokenClassifier {
    /// Settings and per-class weights.
    config: ClassificationConfig,
    /// Earlier results keyed by `(token, is_method_call, is_field_access)`.
    cache: HashMap<(String, bool, bool), TokenClass>,
}
/// Exact names treated as iterator/index variables.
const ITERATOR_VARS: &[&str] = &[
    "i", "j", "k", "n", "idx", "index", "iter", "it", "cursor",
];

/// Substrings that mark a name as a counter.
const COUNTER_PARTS: &[&str] = &["count", "num", "total"];

/// Exact names treated as temporaries.
const TEMPORARY_VARS: &[&str] = &["temp", "tmp", "result", "res", "ret", "val"];

/// Substrings that mark a name as configuration.
const CONFIG_PARTS: &[&str] = &["config", "setting", "option", "param"];

/// Substrings that mark a name as a resource handle.
const RESOURCE_PARTS: &[&str] = &["file", "conn", "client", "socket", "stream", "handle"];

/// Substrings that mark a name as plain data.
const DATA_PARTS: &[&str] = &["data", "value", "item", "element", "node", "entry"];

/// Local-variable naming rules, tried in order; the first matching rule wins.
const LOCAL_VAR_RULES: &[(VarType, LocalVarPattern)] = &[
    (VarType::Iterator, LocalVarPattern::Exact(ITERATOR_VARS)),
    (VarType::Counter, LocalVarPattern::Contains(COUNTER_PARTS)),
    (VarType::Temporary, LocalVarPattern::Exact(TEMPORARY_VARS)),
    (VarType::Configuration, LocalVarPattern::Contains(CONFIG_PARTS)),
    (VarType::Resource, LocalVarPattern::Contains(RESOURCE_PARTS)),
    (VarType::Data, LocalVarPattern::Contains(DATA_PARTS)),
];
/// Matching strategy used by one local-variable naming rule.
enum LocalVarPattern {
    /// The token must be equal to one of the listed names.
    Exact(&'static [&'static str]),
    /// The token must contain one of the listed fragments as a substring.
    Contains(&'static [&'static str]),
}

impl LocalVarPattern {
    /// Reports whether `token` satisfies this pattern (comparison is case-sensitive).
    fn matches(&self, token: &str) -> bool {
        match *self {
            LocalVarPattern::Exact(names) => names.iter().any(|name| *name == token),
            LocalVarPattern::Contains(fragments) => fragments
                .iter()
                .any(|fragment| token.contains(*fragment)),
        }
    }
}
/// Maps a control-flow keyword to its `TokenClass`, or returns `None` for
/// any other token. `else` and `elif` are grouped with `if`.
fn classify_control_flow(token: &str) -> Option<TokenClass> {
    let flow = match token {
        "if" | "else" | "elif" => FlowType::If,
        "match" => FlowType::Match,
        "loop" => FlowType::Loop,
        "while" => FlowType::While,
        "for" => FlowType::For,
        "return" => FlowType::Return,
        "break" => FlowType::Break,
        "continue" => FlowType::Continue,
        _ => return None,
    };
    Some(TokenClass::ControlFlow(flow))
}
/// Classifies `token` as a method call when the context marks it as one;
/// returns `None` otherwise.
fn classify_method_call_context(
    classifier: &TokenClassifier,
    token: &str,
    context: &TokenContext,
) -> Option<TokenClass> {
    if context.is_method_call {
        Some(classifier.classify_method_call(token, context))
    } else {
        None
    }
}
/// Classifies `token` as a field access when the context marks it as one;
/// returns `None` otherwise.
fn classify_field_access_context(
    classifier: &TokenClassifier,
    token: &str,
    context: &TokenContext,
) -> Option<TokenClass> {
    if !context.is_field_access {
        return None;
    }
    Some(classifier.classify_field_access(token, context))
}
fn classify_local_var_context(
classifier: &TokenClassifier,
token: &str,
context: &TokenContext,
) -> Option<TokenClass> {
(!context.is_external && is_identifier_token(token))
.then(|| classifier.classify_local_var(token))
}
/// Reports whether `token` has the shape of an identifier: a letter or
/// underscore followed by any number of alphanumeric characters or
/// underscores.
///
/// The empty string and tokens that start with a digit (for example `42` or
/// `9lives`) are rejected, so numeric literals are not mistaken for variable
/// names.
fn is_identifier_token(token: &str) -> bool {
    let mut chars = token.chars();
    match chars.next() {
        Some(first) if first.is_alphabetic() || first == '_' => {
            chars.all(|c| c.is_alphanumeric() || c == '_')
        }
        _ => false,
    }
}
/// Classifies `token` as a literal, or returns `None` if it is not one.
///
/// A token is numeric when it contains at least one ASCII digit and parses
/// as an `f64`. Everything else is handed to `classify_named_literal`.
fn classify_literal(token: &str) -> Option<TokenClass> {
    // `f64::from_str` also accepts the words "inf", "infinity" and "nan"
    // (in any case, optionally signed). Requiring a digit keeps identifiers
    // with those names from being reported as numeric literals.
    let has_digit = token.bytes().any(|b| b.is_ascii_digit());
    if has_digit && token.parse::<f64>().is_ok() {
        Some(TokenClass::Literal(LiteralCategory::Numeric))
    } else {
        classify_named_literal(token)
    }
}
/// Classifies non-numeric literals: booleans, null-like words, quoted
/// strings and quoted characters, checked in that order. Returns `None` for
/// anything else.
fn classify_named_literal(token: &str) -> Option<TokenClass> {
    let category = if matches!(token, "true" | "false") {
        LiteralCategory::Boolean
    } else if matches!(token, "null" | "None" | "nil") {
        LiteralCategory::Null
    } else if is_string_literal(token) {
        LiteralCategory::String
    } else if is_char_literal(token) {
        LiteralCategory::Char
    } else {
        return None;
    };
    Some(TokenClass::Literal(category))
}
/// Reports whether `token` is wrapped in a pair of double quotes.
///
/// At least two bytes are required so that a lone `"` — which both starts
/// and ends with a quote — is not treated as a complete string literal.
fn is_string_literal(token: &str) -> bool {
    token.len() >= 2 && token.starts_with('"') && token.ends_with('"')
}
/// Reports whether `token` is a single-quoted character literal.
///
/// The quoted content must be exactly one character (of any UTF-8 width) or
/// one escape sequence. Counting characters rather than bytes means
/// literals such as `'é'` and `'\n'` are recognised as well as `'a'`.
fn is_char_literal(token: &str) -> bool {
    let inner = token
        .strip_prefix('\'')
        .and_then(|rest| rest.strip_suffix('\''));
    match inner {
        Some(inner) => inner.chars().count() == 1 || is_char_escape(inner),
        None => false,
    }
}

/// Reports whether `inner` is a backslash escape: a simple escape (`\n`,
/// `\r`, `\t`, `\0`, `\\`, `\'`, `\"`), `\xHH`, or `\u{H..}` with one to
/// six hex digits.
fn is_char_escape(inner: &str) -> bool {
    let body = match inner.strip_prefix('\\') {
        Some(body) => body,
        None => return false,
    };
    if let Some(hex) = body.strip_prefix('x') {
        return hex.len() == 2 && hex.chars().all(|c| c.is_ascii_hexdigit());
    }
    if let Some(hex) = body
        .strip_prefix("u{")
        .and_then(|rest| rest.strip_suffix('}'))
    {
        return (1..=6).contains(&hex.len()) && hex.chars().all(|c| c.is_ascii_hexdigit());
    }
    matches!(body, "n" | "r" | "t" | "0" | "\\" | "'" | "\"")
}
/// Wraps `token` in `TokenClass::Keyword` when it is a recognised keyword;
/// returns `None` otherwise.
fn classify_keyword(token: &str) -> Option<TokenClass> {
    if is_keyword(token) {
        Some(TokenClass::Keyword(token.to_owned()))
    } else {
        None
    }
}
/// Reports whether `token` is one of the recognised declaration/module
/// keywords. Control-flow keywords are handled separately and are not
/// listed here.
fn is_keyword(token: &str) -> bool {
    const KEYWORDS: &[&str] = &[
        "fn", "let", "const", "mut", "pub", "struct", "enum", "trait", "impl", "mod", "use",
        "async", "await", "self", "Self",
    ];
    KEYWORDS.contains(&token)
}
/// Wraps `token` in `TokenClass::Operator` when it consists of operator
/// characters; returns `None` otherwise.
fn classify_operator(token: &str) -> Option<TokenClass> {
    Some(token)
        .filter(|candidate| is_operator(candidate))
        .map(|operator| TokenClass::Operator(operator.to_string()))
}
/// Reports whether `token` is non-empty and made up solely of operator
/// characters (`+ - * / % = < > ! & | ^ ~ ? .`).
///
/// The explicit emptiness check matters because `all` is vacuously true for
/// an empty token, which would otherwise be reported as an operator.
fn is_operator(token: &str) -> bool {
    const OPERATOR_CHARS: &str = "+-*/%=<>!&|^~?.";
    !token.is_empty() && token.chars().all(|c| OPERATOR_CHARS.contains(c))
}
impl TokenClassifier {
pub fn new(config: ClassificationConfig) -> Self {
Self {
config,
cache: HashMap::new(),
}
}
pub fn classify(&mut self, token: &str, context: &TokenContext) -> TokenClass {
if !self.config.enabled {
return TokenClass::Unknown(token.to_string());
}
let cache_key = (
token.to_string(),
context.is_method_call,
context.is_field_access,
);
if let Some(cached) = self.cache.get(&cache_key) {
return cached.clone();
}
let class = self.classify_internal(token, context);
if self.cache.len() < self.config.cache_size {
self.cache.insert(cache_key, class.clone());
}
class
}
fn classify_internal(&self, token: &str, context: &TokenContext) -> TokenClass {
self.classify_contextual_token(token, context)
.or_else(|| classify_literal(token))
.or_else(|| classify_keyword(token))
.or_else(|| classify_operator(token))
.unwrap_or_else(|| TokenClass::Unknown(token.to_string()))
}
fn classify_contextual_token(&self, token: &str, context: &TokenContext) -> Option<TokenClass> {
classify_control_flow(token)
.or_else(|| classify_method_call_context(self, token, context))
.or_else(|| classify_field_access_context(self, token, context))
.or_else(|| classify_local_var_context(self, token, context))
}
fn classify_method_call(&self, token: &str, context: &TokenContext) -> TokenClass {
let lower = token.to_lowercase();
if lower.starts_with("get_") || lower.ends_with("_ref") || lower.starts_with("as_") {
return TokenClass::MethodCall(CallType::Getter);
}
if lower.starts_with("set_") || lower.starts_with("with_") {
return TokenClass::MethodCall(CallType::Setter);
}
if lower.starts_with("is_")
|| lower.starts_with("has_")
|| lower.starts_with("can_")
|| lower.starts_with("should_")
{
return TokenClass::MethodCall(CallType::Validator);
}
if lower.starts_with("to_")
|| lower.starts_with("into_")
|| lower.starts_with("from_")
|| lower == "parse"
{
return TokenClass::MethodCall(CallType::Converter);
}
if matches!(
lower.as_str(),
"read"
| "write"
| "send"
| "receive"
| "recv"
| "flush"
| "sync"
| "open"
| "close"
| "connect"
) {
return TokenClass::MethodCall(CallType::IO);
}
if matches!(
lower.as_str(),
"unwrap" | "expect" | "map_err" | "ok" | "err" | "and_then" | "or_else" | "unwrap_or"
) {
return TokenClass::MethodCall(CallType::ErrorHandle);
}
if matches!(
lower.as_str(),
"push"
| "pop"
| "insert"
| "remove"
| "clear"
| "len"
| "is_empty"
| "contains"
| "get"
| "iter"
| "map"
| "filter"
| "fold"
| "collect"
| "sort"
) {
return TokenClass::MethodCall(CallType::Collection);
}
if context.is_external {
return TokenClass::MethodCall(CallType::External);
}
TokenClass::MethodCall(CallType::Other)
}
fn classify_field_access(&self, _token: &str, _context: &TokenContext) -> TokenClass {
TokenClass::FieldAccess(AccessType::Getter)
}
fn classify_local_var(&self, token: &str) -> TokenClass {
let lower = token.to_lowercase();
LOCAL_VAR_RULES
.iter()
.find(|(_, pattern)| pattern.matches(&lower))
.map(|(var_type, _)| TokenClass::LocalVar(var_type.clone()))
.unwrap_or(TokenClass::LocalVar(VarType::Other))
}
pub fn get_weight(&self, class: &TokenClass) -> f64 {
self.config.weights.get(class).copied().unwrap_or(0.5)
}
pub fn update_weights(&mut self, weights: HashMap<TokenClass, f64>) {
self.config.weights = weights;
}
pub fn clear_cache(&mut self) {
self.cache.clear();
}
}
/// A token together with its classification, context and weight.
#[derive(Debug, Clone)]
pub struct ClassifiedToken {
    /// Classification assigned to the token.
    pub class: TokenClass,
    /// Original token text.
    pub raw_token: String,
    /// Context in which the token was seen.
    pub context: TokenContext,
    /// Weight associated with the token.
    pub weight: f64,
}
impl ClassifiedToken {
pub fn new(class: TokenClass, raw_token: String, context: TokenContext, weight: f64) -> Self {
Self {
class,
raw_token,
context,
weight,
}
}
}