use std::collections::HashMap;
/// Heuristic role assigned to a local-variable token.
///
/// Carried by `TokenClass::LocalVar`. The per-variant notes describe the
/// (lower-cased) name heuristics used by `TokenClassifier::classify_local_var`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum VarType {
    /// Exact short loop/cursor names such as `i`, `j`, `idx`, `iter`, or `cursor`.
    Iterator,
    /// Names containing `count`, `num`, or `total`.
    Counter,
    /// Exact scratch names such as `tmp`, `temp`, `result`, `ret`, or `val`.
    Temporary,
    /// Names containing `config`, `setting`, `option`, or `param`.
    Configuration,
    /// Names containing `file`, `conn`, `client`, `socket`, `stream`, or `handle`.
    Resource,
    /// Names containing `data`, `value`, `item`, `element`, `node`, or `entry`.
    Data,
    /// Identifier that matched none of the heuristics above.
    Other,
}
/// Flavour of a field access, carried by `TokenClass::FieldAccess`.
///
/// `TokenClassifier::classify_field_access` only ever produces `Getter`; the
/// remaining variants exist as separately weightable categories. Their
/// descriptions below follow the variant names and should be confirmed
/// against whatever code constructs them.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum AccessType {
    /// Field read; the category assigned to every field-access token by the classifier.
    Getter,
    /// Presumably a field write.
    Setter,
    /// Presumably an access that is part of a longer `a.b.c` chain.
    Chained,
    /// Presumably an access into a collection-typed field.
    Collection,
}
/// Category of a method call, carried by `TokenClass::MethodCall`.
///
/// The per-variant notes summarise the (lower-cased) name heuristics applied
/// by `TokenClassifier::classify_method_call`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CallType {
    /// Names starting with `get_` or `as_`, or ending in `_ref`.
    Getter,
    /// Names starting with `set_` or `with_`.
    Setter,
    /// Names starting with `is_`, `has_`, `can_`, or `should_`.
    Validator,
    /// Names starting with `to_`, `into_`, or `from_`, plus `parse`.
    Converter,
    /// I/O verbs such as `read`, `write`, `send`, `recv`, `flush`, `open`, `close`.
    IO,
    /// Error-plumbing calls such as `unwrap`, `expect`, `map_err`, `and_then`.
    ErrorHandle,
    /// Container operations such as `push`, `pop`, `insert`, `len`, `iter`, `map`.
    Collection,
    /// Unrecognised method name seen in a context flagged as external.
    External,
    /// Unrecognised method name in a non-external context.
    Other,
}
/// Control-flow construct, carried by `TokenClass::ControlFlow`.
///
/// Each variant is selected by an exact keyword match in the classifier.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum FlowType {
    /// Conditional branch; assigned to the tokens `if`, `else`, and `elif`.
    If,
    /// The `match` keyword.
    Match,
    /// The `loop` keyword.
    Loop,
    /// The `while` keyword.
    While,
    /// The `for` keyword.
    For,
    /// The `return` keyword.
    Return,
    /// The `break` keyword.
    Break,
    /// The `continue` keyword.
    Continue,
}
/// Error-handling construct, carried by `TokenClass::ErrorHandling`.
///
/// The classifier code alongside this enum does not construct these values
/// itself; it only assigns each one a default weight. The variant notes below
/// follow the variant names and should be confirmed against the producer.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum ErrorType {
    /// Presumably a use of the `Result` type.
    Result,
    /// Presumably a use of the `Option` type.
    Option,
    /// Presumably an `unwrap` call.
    Unwrap,
    /// Presumably an `expect` call.
    Expect,
    /// Presumably the `?` propagation operator.
    QuestionMark,
    /// Presumably a `map_err` call.
    MapErr,
    /// Presumably an `and_then` call.
    AndThen,
    /// Presumably an `or_else` call.
    OrElse,
}
/// Kind of collection operation, carried by `TokenClass::Collection`.
///
/// The classifier code alongside this enum does not construct these values
/// itself; it only assigns each one a default weight. The variant notes below
/// follow the variant names and should be confirmed against the producer.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CollectionOp {
    /// Presumably walking over the elements of a collection.
    Iteration,
    /// Presumably transforming each element.
    Mapping,
    /// Presumably selecting a subset of elements.
    Filtering,
    /// Presumably reducing elements to a single result.
    Aggregation,
    /// Presumably reading an element.
    Access,
    /// Presumably modifying the collection.
    Mutation,
}
/// Kind of literal value, carried by `TokenClass::Literal`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum LiteralCategory {
    /// Numeric literal (the classifier recognises it by parsing the token as `f64`).
    Numeric,
    /// Double-quoted string literal.
    String,
    /// The tokens `true` and `false`.
    Boolean,
    /// Single-quoted character literal.
    Char,
    /// Null-like tokens: `null`, `None`, and `nil`.
    Null,
}
/// Semantic class assigned to a single token.
///
/// The type is hashable so that it can be used directly as a key in the
/// weight table of `ClassificationConfig`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TokenClass {
    /// Identifier treated as a local variable, with its heuristic role.
    LocalVar(VarType),
    /// Field access, with its flavour.
    FieldAccess(AccessType),
    /// Method call, with its category.
    MethodCall(CallType),
    /// External API reference; the meaning of the payload string is not
    /// established by the classifier code here (presumably the API name).
    ExternalAPI(String),
    /// Control-flow keyword.
    ControlFlow(FlowType),
    /// Error-handling construct.
    ErrorHandling(ErrorType),
    /// Collection operation.
    Collection(CollectionOp),
    /// Literal value.
    Literal(LiteralCategory),
    /// Language keyword; the payload is the keyword text.
    Keyword(String),
    /// Operator; the payload is the operator text.
    Operator(String),
    /// Token that could not be classified (or classification is disabled);
    /// the payload is the raw token text.
    Unknown(String),
}
/// Syntactic context supplied alongside a token when it is classified.
#[derive(Clone, Debug)]
pub struct TokenContext {
    /// When `true`, the classifier applies its method-name heuristics to the
    /// token (checked right after the control-flow keywords).
    pub is_method_call: bool,
    /// When `true` (and `is_method_call` is `false`), the token is classified
    /// as a field access.
    pub is_field_access: bool,
    /// When `true`, plain identifiers are not treated as local variables and
    /// otherwise unrecognised method names are tagged `CallType::External`.
    pub is_external: bool,
    /// Not read by the classifier code in this file; presumably the nesting
    /// depth of the enclosing scope (confirm with the producer).
    pub scope_depth: usize,
    /// Not read by the classifier code in this file; presumably the kind of
    /// syntax node that contains the token (confirm with the producer).
    pub parent_node_type: NodeType,
}
/// Kind of syntax node, stored in `TokenContext::parent_node_type`.
///
/// The classifier code alongside this enum never inspects the value, so the
/// variant notes below follow the variant names only.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum NodeType {
    /// Presumably a free function.
    Function,
    /// Presumably a method.
    Method,
    /// Presumably a closure.
    Closure,
    /// Presumably a block.
    Block,
    /// Presumably an expression.
    Expression,
    /// Presumably a statement.
    Statement,
    /// Presumably a pattern.
    Pattern,
    /// Presumably a type position.
    Type,
}
/// Settings that drive a `TokenClassifier`.
#[derive(Clone, Debug)]
pub struct ClassificationConfig {
    /// Master switch: while `false`, the classifier reports every token as
    /// `TokenClass::Unknown` without running any heuristics.
    pub enabled: bool,
    /// Weight per token class; classes missing from the table fall back to
    /// `0.5` when looked up through `TokenClassifier::get_weight`.
    pub weights: HashMap<TokenClass, f64>,
    /// Maximum number of classification results the classifier memoizes.
    pub cache_size: usize,
}
impl Default for ClassificationConfig {
fn default() -> Self {
let mut weights = HashMap::new();
weights.insert(TokenClass::LocalVar(VarType::Iterator), 0.1);
weights.insert(TokenClass::LocalVar(VarType::Counter), 0.2);
weights.insert(TokenClass::LocalVar(VarType::Temporary), 0.3);
weights.insert(TokenClass::LocalVar(VarType::Configuration), 0.5);
weights.insert(TokenClass::LocalVar(VarType::Resource), 0.7);
weights.insert(TokenClass::LocalVar(VarType::Data), 0.5);
weights.insert(TokenClass::LocalVar(VarType::Other), 0.4);
weights.insert(TokenClass::FieldAccess(AccessType::Getter), 0.3);
weights.insert(TokenClass::FieldAccess(AccessType::Setter), 0.4);
weights.insert(TokenClass::FieldAccess(AccessType::Chained), 0.6);
weights.insert(TokenClass::FieldAccess(AccessType::Collection), 0.5);
weights.insert(TokenClass::MethodCall(CallType::Getter), 0.2);
weights.insert(TokenClass::MethodCall(CallType::Setter), 0.3);
weights.insert(TokenClass::MethodCall(CallType::Validator), 0.4);
weights.insert(TokenClass::MethodCall(CallType::Converter), 0.5);
weights.insert(TokenClass::MethodCall(CallType::IO), 0.9);
weights.insert(TokenClass::MethodCall(CallType::ErrorHandle), 0.7);
weights.insert(TokenClass::MethodCall(CallType::Collection), 0.4);
weights.insert(TokenClass::MethodCall(CallType::External), 1.0);
weights.insert(TokenClass::MethodCall(CallType::Other), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::If), 0.5);
weights.insert(TokenClass::ControlFlow(FlowType::Match), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::Loop), 0.7);
weights.insert(TokenClass::ControlFlow(FlowType::While), 0.7);
weights.insert(TokenClass::ControlFlow(FlowType::For), 0.6);
weights.insert(TokenClass::ControlFlow(FlowType::Return), 0.3);
weights.insert(TokenClass::ControlFlow(FlowType::Break), 0.4);
weights.insert(TokenClass::ControlFlow(FlowType::Continue), 0.4);
weights.insert(TokenClass::ErrorHandling(ErrorType::Result), 0.6);
weights.insert(TokenClass::ErrorHandling(ErrorType::Option), 0.5);
weights.insert(TokenClass::ErrorHandling(ErrorType::Unwrap), 0.8);
weights.insert(TokenClass::ErrorHandling(ErrorType::Expect), 0.8);
weights.insert(TokenClass::ErrorHandling(ErrorType::QuestionMark), 0.4);
weights.insert(TokenClass::ErrorHandling(ErrorType::MapErr), 0.6);
weights.insert(TokenClass::ErrorHandling(ErrorType::AndThen), 0.5);
weights.insert(TokenClass::ErrorHandling(ErrorType::OrElse), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Iteration), 0.3);
weights.insert(TokenClass::Collection(CollectionOp::Mapping), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Filtering), 0.5);
weights.insert(TokenClass::Collection(CollectionOp::Aggregation), 0.7);
weights.insert(TokenClass::Collection(CollectionOp::Access), 0.4);
weights.insert(TokenClass::Collection(CollectionOp::Mutation), 0.6);
weights.insert(TokenClass::Literal(LiteralCategory::Numeric), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::String), 0.2);
weights.insert(TokenClass::Literal(LiteralCategory::Boolean), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::Char), 0.1);
weights.insert(TokenClass::Literal(LiteralCategory::Null), 0.1);
Self {
enabled: false, weights,
cache_size: 10000,
}
}
}
#[derive(Debug)]
pub struct TokenClassifier {
config: ClassificationConfig,
cache: HashMap<(String, bool, bool), TokenClass>,
}
impl TokenClassifier {
pub fn new(config: ClassificationConfig) -> Self {
Self {
config,
cache: HashMap::new(),
}
}
pub fn classify(&mut self, token: &str, context: &TokenContext) -> TokenClass {
if !self.config.enabled {
return TokenClass::Unknown(token.to_string());
}
let cache_key = (
token.to_string(),
context.is_method_call,
context.is_field_access,
);
if let Some(cached) = self.cache.get(&cache_key) {
return cached.clone();
}
let class = self.classify_internal(token, context);
if self.cache.len() < self.config.cache_size {
self.cache.insert(cache_key, class.clone());
}
class
}
fn classify_internal(&self, token: &str, context: &TokenContext) -> TokenClass {
if matches!(token, "if" | "else" | "elif") {
return TokenClass::ControlFlow(FlowType::If);
}
if token == "match" {
return TokenClass::ControlFlow(FlowType::Match);
}
if token == "loop" {
return TokenClass::ControlFlow(FlowType::Loop);
}
if token == "while" {
return TokenClass::ControlFlow(FlowType::While);
}
if token == "for" {
return TokenClass::ControlFlow(FlowType::For);
}
if token == "return" {
return TokenClass::ControlFlow(FlowType::Return);
}
if token == "break" {
return TokenClass::ControlFlow(FlowType::Break);
}
if token == "continue" {
return TokenClass::ControlFlow(FlowType::Continue);
}
if context.is_method_call {
return self.classify_method_call(token, context);
}
if context.is_field_access {
return self.classify_field_access(token, context);
}
if !context.is_external && token.chars().all(|c| c.is_alphanumeric() || c == '_') {
return self.classify_local_var(token);
}
if token.parse::<f64>().is_ok() {
return TokenClass::Literal(LiteralCategory::Numeric);
}
if token == "true" || token == "false" {
return TokenClass::Literal(LiteralCategory::Boolean);
}
if token.starts_with('"') && token.ends_with('"') {
return TokenClass::Literal(LiteralCategory::String);
}
if token.starts_with('\'') && token.ends_with('\'') && token.len() == 3 {
return TokenClass::Literal(LiteralCategory::Char);
}
if token == "null" || token == "None" || token == "nil" {
return TokenClass::Literal(LiteralCategory::Null);
}
if matches!(
token,
"fn" | "let"
| "const"
| "mut"
| "pub"
| "struct"
| "enum"
| "trait"
| "impl"
| "mod"
| "use"
| "async"
| "await"
| "self"
| "Self"
) {
return TokenClass::Keyword(token.to_string());
}
if token.chars().all(|c| "+-*/%=<>!&|^~?.".contains(c)) {
return TokenClass::Operator(token.to_string());
}
TokenClass::Unknown(token.to_string())
}
fn classify_method_call(&self, token: &str, context: &TokenContext) -> TokenClass {
let lower = token.to_lowercase();
if lower.starts_with("get_") || lower.ends_with("_ref") || lower.starts_with("as_") {
return TokenClass::MethodCall(CallType::Getter);
}
if lower.starts_with("set_") || lower.starts_with("with_") {
return TokenClass::MethodCall(CallType::Setter);
}
if lower.starts_with("is_")
|| lower.starts_with("has_")
|| lower.starts_with("can_")
|| lower.starts_with("should_")
{
return TokenClass::MethodCall(CallType::Validator);
}
if lower.starts_with("to_")
|| lower.starts_with("into_")
|| lower.starts_with("from_")
|| lower == "parse"
{
return TokenClass::MethodCall(CallType::Converter);
}
if matches!(
lower.as_str(),
"read"
| "write"
| "send"
| "receive"
| "recv"
| "flush"
| "sync"
| "open"
| "close"
| "connect"
) {
return TokenClass::MethodCall(CallType::IO);
}
if matches!(
lower.as_str(),
"unwrap" | "expect" | "map_err" | "ok" | "err" | "and_then" | "or_else" | "unwrap_or"
) {
return TokenClass::MethodCall(CallType::ErrorHandle);
}
if matches!(
lower.as_str(),
"push"
| "pop"
| "insert"
| "remove"
| "clear"
| "len"
| "is_empty"
| "contains"
| "get"
| "iter"
| "map"
| "filter"
| "fold"
| "collect"
| "sort"
) {
return TokenClass::MethodCall(CallType::Collection);
}
if context.is_external {
return TokenClass::MethodCall(CallType::External);
}
TokenClass::MethodCall(CallType::Other)
}
fn classify_field_access(&self, _token: &str, _context: &TokenContext) -> TokenClass {
TokenClass::FieldAccess(AccessType::Getter)
}
fn classify_local_var(&self, token: &str) -> TokenClass {
let lower = token.to_lowercase();
if matches!(
lower.as_str(),
"i" | "j" | "k" | "n" | "idx" | "index" | "iter" | "it" | "cursor"
) {
return TokenClass::LocalVar(VarType::Iterator);
}
if lower.contains("count") || lower.contains("num") || lower.contains("total") {
return TokenClass::LocalVar(VarType::Counter);
}
if matches!(
lower.as_str(),
"temp" | "tmp" | "result" | "res" | "ret" | "val"
) {
return TokenClass::LocalVar(VarType::Temporary);
}
if lower.contains("config")
|| lower.contains("setting")
|| lower.contains("option")
|| lower.contains("param")
{
return TokenClass::LocalVar(VarType::Configuration);
}
if lower.contains("file")
|| lower.contains("conn")
|| lower.contains("client")
|| lower.contains("socket")
|| lower.contains("stream")
|| lower.contains("handle")
{
return TokenClass::LocalVar(VarType::Resource);
}
if lower.contains("data")
|| lower.contains("value")
|| lower.contains("item")
|| lower.contains("element")
|| lower.contains("node")
|| lower.contains("entry")
{
return TokenClass::LocalVar(VarType::Data);
}
TokenClass::LocalVar(VarType::Other)
}
pub fn get_weight(&self, class: &TokenClass) -> f64 {
self.config.weights.get(class).copied().unwrap_or(0.5)
}
pub fn update_weights(&mut self, weights: HashMap<TokenClass, f64>) {
self.config.weights = weights;
}
pub fn clear_cache(&mut self) {
self.cache.clear();
}
}
/// A token bundled with the outcome of its classification.
#[derive(Clone, Debug)]
pub struct ClassifiedToken {
    /// Class assigned to the token.
    pub class: TokenClass,
    /// Token text as supplied by the caller.
    pub raw_token: String,
    /// Context the token was seen in.
    pub context: TokenContext,
    /// Weight associated with the token (supplied by the caller; presumably
    /// the value looked up for `class`).
    pub weight: f64,
}
impl ClassifiedToken {
pub fn new(class: TokenClass, raw_token: String, context: TokenContext, weight: f64) -> Self {
Self {
class,
raw_token,
context,
weight,
}
}
}