oak_core/language/mod.rs
1use std::{fmt::Debug, hash::Hash};
2
/// Represents the broad category a language belongs to.
///
/// Categories are intended to let the framework apply default behaviors,
/// such as choosing appropriate lexer/parser strategies or selecting
/// default themes for syntax highlighting.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum LanguageCategory {
    /// General-purpose programming languages (e.g., Rust, C, Java).
    Programming,
    /// Markup and document languages (e.g., Markdown, HTML, Typst).
    Markup,
    /// Configuration and data serialization languages (e.g., YAML, JSON, TOML).
    Config,
    /// Styling languages (e.g., CSS, Sass, Less).
    StyleSheet,
    /// Domain-specific languages or specialized notation (e.g., SQL, Regex, Math).
    Dsl,
    /// Modeling languages (e.g., UML, Mermaid, PlantUML).
    Modeling,
    /// Other or unclassified.
    Other,
}
26
27/// Language definition trait that coordinates all language-related types and behaviors.
28///
29/// This trait ties together language-specific components like lexers, parsers, and ASTs.
30/// It enables the framework to be language-agnostic while providing type-safe
31/// language-specific functionality.
32pub trait Language: Send + Sync {
33 /// The name of the language (e.g., "rust", "sql").
34 const NAME: &'static str;
35
36 /// The category of the language.
37 const CATEGORY: LanguageCategory = LanguageCategory::Programming;
38
39 /// The token type used to represent different token and node types in the language.
40 ///
41 /// This associated type defines how different syntactic elements (tokens, nodes) are
42 /// categorized and identified within the language. It must implement `Copy` and `Eq`
43 /// to ensure efficient handling in the parsing system.
44 ///
45 /// # Requirements
46 ///
47 /// The token type must:
48 /// - Implement the `TokenType` trait
49 /// - Be copyable to enable efficient passing
50 /// - Support equality comparison for token matching
51 /// - Be sendable across thread boundaries
52 ///
53 /// # Examples
54 ///
55 /// ```
56 /// #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
57 /// enum RustSyntaxKind {
58 /// LetKeyword,
59 /// Identifier,
60 /// Number,
61 /// // ... other token kinds
62 /// }
63 /// ```
64 type TokenType: TokenType;
65
66 /// The element type used to represent composite structures in the parsed tree.
67 ///
68 /// While tokens represent the atomic units of the language, elements represent
69 /// the composite structures formed by combining tokens according to grammar rules.
70 /// This includes expressions, statements, declarations, and other syntactic constructs.
71 ///
72 /// # Requirements
73 ///
74 /// The element type must:
75 /// - Implement the `ElementType` trait
76 /// - Be copyable for efficient handling
77 /// - Support equality comparison
78 /// - Be sendable across thread boundaries
79 type ElementType: ElementType;
80
81 /// The root type for the parsed tree that represents the top-level structure of the language.
82 ///
83 /// This associated type defines the structure of the root node in the parsed tree,
84 /// which typically contains the entire parsed source code organized according to the
85 /// language's grammar rules. The root type serves as the entry point for traversing
86 /// and manipulating the parsed representation.
87 ///
88 /// # Design Considerations
89 ///
90 /// The root type should:
91 /// - Contain references to all top-level language constructs
92 /// - Provide efficient access to the parsed content
93 /// - Support incremental updates when the source changes
94 ///
95 /// # Examples
96 ///
97 /// ```ignore
98 /// struct RustRoot {
99 /// items: Vec<RustItem>,
100 /// }
101 ///
102 /// struct RustRoot {
103 /// modules: Vec<Module>,
104 /// imports: Vec<Import>,
105 /// declarations: Vec<Declaration>,
106 /// }
107 /// ```
108 type TypedRoot;
109}
110
111/// Token type definitions for tokens in the parsing system.
112///
113/// This trait serves as the foundation for defining different types of tokens.
114/// By mapping language-specific kinds to [`UniversalTokenRole`], generic tools
115/// like highlighters and formatters can work across many languages.
116macro_rules! define_token_type {
117 ($($bound:tt)*) => {
118 /// A trait for types that represent a token's kind in a specific language.
119 pub trait TokenType: Copy + Eq + Hash + Send + Sync + std::fmt::Debug $($bound)* {
120 /// The associated role type for this token kind.
121 type Role: TokenRole;
122
123 /// A constant representing the end of the input stream.
124 const END_OF_STREAM: Self;
125
126 /// Returns the general syntactic role of this token.
127 ///
128 /// The role determines how the token is treated by generic language tools.
129 /// For example, tokens with [`UniversalTokenRole::Keyword`] are typically
130 /// highlighted using a specific theme color, regardless of the language.
131 fn role(&self) -> Self::Role;
132
133 /// Returns true if this token matches the specified language-specific role.
134 fn is_role(&self, role: Self::Role) -> bool {
135 self.role() == role
136 }
137
138 /// Returns true if this token matches the specified universal role.
139 fn is_universal(&self, role: UniversalTokenRole) -> bool {
140 self.role().universal() == role
141 }
142
143 /// Returns true if this token represents a comment.
144 fn is_comment(&self) -> bool {
145 self.is_universal(UniversalTokenRole::Comment)
146 }
147
148 /// Returns true if this token represents whitespace.
149 fn is_whitespace(&self) -> bool {
150 self.is_universal(UniversalTokenRole::Whitespace)
151 }
152
153 /// Returns true if this token represents an error condition.
154 fn is_error(&self) -> bool {
155 self.is_universal(UniversalTokenRole::Error)
156 }
157
158 /// Returns true if this token represents trivia (whitespace, comments, etc.).
159 fn is_ignored(&self) -> bool {
160 self.is_whitespace() || self.is_comment()
161 }
162
163 /// Returns true if this token represents the end of the input stream.
164 fn is_end_of_stream(&self) -> bool {
165 *self == Self::END_OF_STREAM
166 }
167 }
168 };
169}
170
171define_token_type!();
172
173/// A trait for types that can represent a token's syntactic role.
174pub trait TokenRole: Copy + Eq + Send {
175 /// Maps this role to a universal, language-agnostic role.
176 fn universal(&self) -> UniversalTokenRole;
177
178 /// Returns a specific name for this role, used for granular highlighting.
179 ///
180 /// For universal roles, this should return the standard scope name (e.g., "keyword").
181 /// For language-specific roles, it can return more specific names (e.g., "keyword.control").
182 fn name(&self) -> &str;
183}
184
/// Represents the general syntactic role of a token across diverse languages.
///
/// By mapping to these roles, generic tools can identify names, literals, or operators
/// across many languages without needing to learn the specifics of each grammar.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
pub enum UniversalTokenRole {
    /// Language reserved words or built-in commands (e.g., 'SELECT', 'let', 'MOV').
    Keyword,
    /// Identifiers, labels, keys, tags, or any name-like token.
    Name,
    /// Literal values like strings, numbers, booleans, or nulls.
    Literal,
    /// An escape sequence or a special character representation within a literal.
    Escape,
    /// Mathematical, logical, or structural operators (e.g., '+', '=>', 'LIKE').
    Operator,
    /// Structural characters like brackets, commas, semicolons.
    Punctuation,
    /// Developer annotations or documentation.
    Comment,
    /// Formatting characters like spaces or tabs.
    Whitespace,
    /// Malformed or unrecognized content.
    Error,
    /// No specific role assigned.
    None,
    /// End of stream marker.
    Eof,
}
216
217impl TokenRole for UniversalTokenRole {
218 fn universal(&self) -> UniversalTokenRole {
219 *self
220 }
221
222 fn name(&self) -> &str {
223 match *self {
224 UniversalTokenRole::Keyword => "keyword",
225 UniversalTokenRole::Name => "variable.other",
226 UniversalTokenRole::Literal => "constant",
227 UniversalTokenRole::Escape => "constant.character.escape",
228 UniversalTokenRole::Operator => "keyword.operator",
229 UniversalTokenRole::Punctuation => "punctuation",
230 UniversalTokenRole::Comment => "comment",
231 UniversalTokenRole::Whitespace => "punctuation.whitespace",
232 UniversalTokenRole::Error => "invalid",
233 UniversalTokenRole::None => "none",
234 UniversalTokenRole::Eof => "punctuation.eof",
235 }
236 }
237}
238
239/// Element type definitions for nodes in the parsed tree.
240///
241/// While tokens represent the atomic units of a language, elements represent the
242/// composite structures formed by combining tokens according to grammar rules.
243/// This includes expressions, statements, declarations, and other syntactic constructs.
244///
245/// # Implementation Guidelines
246///
247/// When implementing this trait for a specific language:
248/// - Use an enum with discriminant values for efficient matching
249/// - Include a Root variant to identify the top-level element
250/// - Include an Error variant for malformed constructs
251/// - Define a `Role` associated type and implement the `role()` method.
252///
253/// # Examples
254///
255/// ```ignore
256/// #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
257/// enum MyElement {
258/// Root,
259/// FunctionDeclaration,
260/// Block,
261/// Error,
262/// }
263///
264/// impl ElementType for MyElement {
265/// type Role = UniversalElementRole;
266///
267/// fn role(&self) -> Self::Role {
268/// match self {
269/// MyElement::Root => UniversalElementRole::Root,
270/// MyElement::FunctionDeclaration => UniversalElementRole::Binding,
271/// MyElement::Block => UniversalElementRole::Container,
272/// MyElement::Error => UniversalElementRole::Error,
273/// }
274/// }
275///
276/// fn is_root(&self) -> bool {
277/// matches!(self, MyElement::Root)
278/// }
279///
280/// fn is_error(&self) -> bool {
281/// matches!(self, MyElement::Error)
282/// }
283/// }
284/// ```
285macro_rules! define_element_type {
286 ($($bound:tt)*) => {
287 /// A trait for types that represent an element's kind in a syntax tree.
288 pub trait ElementType: Copy + Eq + Hash + Send + Sync + std::fmt::Debug $($bound)* {
289 /// The associated role type for this element kind.
290 type Role: ElementRole;
291
292 /// Returns the general syntactic role of this element.
293 ///
294 /// The role enables structural analysis without knowing the specific grammar.
295 /// For example, a tool can find all [`UniversalElementRole::Definition`] nodes
296 /// to build a symbol outline for any supported language.
297 fn role(&self) -> Self::Role;
298
299 /// Returns true if this element matches the specified language-specific role.
300 fn is_role(&self, role: Self::Role) -> bool {
301 self.role() == role
302 }
303
304 /// Returns true if this element matches the specified universal role.
305 fn is_universal(&self, role: UniversalElementRole) -> bool {
306 self.role().universal() == role
307 }
308
309 /// Returns true if this element represents the root of the parsed tree.
310 fn is_root(&self) -> bool {
311 self.is_universal(UniversalElementRole::Root)
312 }
313
314 /// Returns true if this element represents an error condition.
315 fn is_error(&self) -> bool {
316 self.is_universal(UniversalElementRole::Error)
317 }
318 }
319 };
320}
321
322define_element_type!();
323
324/// A trait for types that can represent an element's structural role.
325pub trait ElementRole: Copy + Eq + Send {
326 /// Maps this role to a universal, language-agnostic role.
327 fn universal(&self) -> UniversalElementRole;
328
329 /// Returns a specific name for this role, used for granular highlighting.
330 fn name(&self) -> &str;
331}
332
/// Represents the general structural role of a syntax tree element.
///
/// Elements are categorized by their role to enable generic analysis across diverse
/// language families. For example, highlighters and symbol extractors can use these
/// roles to identify definitions, references, or containers without knowing the
/// specific grammar of each language.
///
/// # Structural Groups
///
/// - **Flow Control**: [`UniversalElementRole::Statement`], [`UniversalElementRole::Expression`], [`UniversalElementRole::Call`], and [`UniversalElementRole::Root`].
/// - **Symbol Management**: [`UniversalElementRole::Definition`], [`UniversalElementRole::Binding`], and [`UniversalElementRole::Reference`].
/// - **Hierarchy**: [`UniversalElementRole::Container`].
/// - **Metadata**: [`UniversalElementRole::Typing`], [`UniversalElementRole::Metadata`], [`UniversalElementRole::Attribute`], [`UniversalElementRole::Documentation`].
///
/// The remaining roles ([`UniversalElementRole::AttributeKey`], [`UniversalElementRole::Detail`],
/// [`UniversalElementRole::Name`], [`UniversalElementRole::Value`], [`UniversalElementRole::Embedded`],
/// [`UniversalElementRole::Error`], and [`UniversalElementRole::None`]) are described on the
/// individual variants.
///
/// When a node could fit multiple roles, choose the one that represents its **primary
/// structural intent**.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
#[non_exhaustive]
pub enum UniversalElementRole {
    /// The top-level root of the syntax tree, representing the entire document or source file.
    Root,

    /// A high-level structural container that defines a scope or logical grouping.
    Container,

    /// A node that represents the entire declaration or definition of a symbol.
    ///
    /// This role identifies the "whole" entity that defines something in the code,
    /// which is crucial for building symbol trees and navigation outlines.
    ///
    /// # Examples
    /// - **Rust**: The entire `Fn` declaration block, `Struct` item, or `Enum`.
    /// - **Markdown**: `Heading` or `LinkDefinition`.
    /// - **SQL**: The whole `CREATE TABLE` or `CREATE PROCEDURE` statement.
    /// - **ASM**: A `Proc` (procedure) block or a multi-line data definition.
    /// - **YAML**: A schema-defined object or a complex configuration block.
    Definition,

    /// A node that specifically performs the act of binding a name to an entity.
    ///
    /// Unlike `Definition`, which represents the entire construct, `Binding` targets
    /// the specific part (usually the identifier) that introduces the name.
    ///
    /// # Examples
    /// - **Rust**: The identifier node in a `let` pattern or function name.
    /// - **Markdown**: `LinkLabel` in a reference link definition.
    /// - **SQL**: The `Table` name identifier in `CREATE TABLE`.
    /// - **ASM**: A `Label` node (e.g., `main:`).
    /// - **YAML**: The `Key` in a key-value mapping.
    Binding,

    /// A node that refers to an existing name or entity defined elsewhere.
    ///
    /// # Examples
    /// - **Rust**: `PathExpr` (variable usage) or `MethodCall`.
    /// - **Markdown**: `LinkReference` or `FootnoteReference`.
    /// - **SQL**: `ColumnName` in a `SELECT` clause or `TableName` in `FROM`.
    /// - **ASM**: A `Label` reference in a jump (e.g., `JMP main`).
    /// - **YAML**: An `Alias` anchor (e.g., `*anchor_name`).
    Reference,

    /// A node representing a type signature, constraint, or type reference.
    ///
    /// This role distinguishes type information from general logic or values,
    /// which is essential for type checking and intelligent completion.
    ///
    /// # Examples
    /// - **Rust**: `TypePath` (e.g., `: i32`), `GenericArgument`, or `WhereClause`.
    /// - **SQL**: `DataType` (e.g., `VARCHAR(255)` or `INT`).
    /// - **ASM**: Size specifiers (e.g., `DWORD`, `PTR`).
    /// - **TypeScript**: `TypeAnnotation` or `InterfaceDeclaration`.
    Typing,

    /// Structured comments or documentation nodes attached to other elements.
    ///
    /// Unlike raw `Comment` tokens, these are syntax nodes that may contain
    /// their own internal structure (like Markdown or Tagged parameters).
    ///
    /// # Examples
    /// - **Rust**: `DocComment` (e.g., `/// ...`).
    /// - **Java**: `Javadoc` blocks.
    /// - **Python**: `Docstring` literals.
    Documentation,

    /// High-level annotations, decorators, or macros that provide extra semantic info.
    ///
    /// # Metadata vs Attribute
    /// - **Metadata**: Usually refers to language-level extensions that "decorate" an element
    ///   from the outside, often affecting compilation or runtime behavior (e.g., Rust attributes).
    /// - **Attribute**: Usually refers to built-in, structural properties that are part of the
    ///   element's native definition (e.g., HTML attributes).
    ///
    /// # Examples
    /// - **Rust**: `Attribute` (e.g., `#[derive(...)]`) or `MacroCall`.
    /// - **Markdown**: `Frontmatter` (YAML/TOML header).
    /// - **Java/TS**: `@Decorator` or `@Annotation`.
    /// - **Python**: `@decorator` syntax.
    Metadata,

    /// A specific property, flag, or attribute-value pair.
    ///
    /// Unlike `Metadata`, which decorates an element with external logic, `Attribute`
    /// represents intrinsic properties defined by the language's schema or structure.
    ///
    /// # Examples
    /// - **HTML/XML**: An `Attribute` (e.g., `id="main"`).
    /// - **Markdown**: `LinkTitle` or `ImageAlt` text.
    /// - **YAML**: A specific configuration property.
    /// - **ASM**: Segment attributes (e.g., `READONLY`, `EXECUTE`).
    Attribute,

    /// The key part of an attribute, property, or configuration entry.
    ///
    /// This role is distinct because:
    /// - It is not a **Reference** (it doesn't refer to an external symbol).
    /// - It is not a traditional **Binding** (it doesn't define a symbol in a global or lexical scope).
    /// - It is not a **Keyword** (it is typically a user-defined or schema-defined identifier).
    ///
    /// # Examples
    /// - **HTML**: The `id` in `id="main"`.
    /// - **Markdown**: `AttributeName` (in Pandoc-style `{ #id .class }`).
    /// - **YAML**: The key in a property mapping.
    /// - **TOML**: The key in a table entry.
    AttributeKey,

    /// A node that provides additional details or secondary information for another element.
    ///
    /// # Examples
    /// - **Rust**: `GenericParameter` list, `FunctionParameter` list.
    /// - **SQL**: `Constraint` details.
    Detail,

    /// A node that represents the name of an element, typically used in declarations.
    ///
    /// # Examples
    /// - **Rust**: The name identifier in a function or struct definition.
    /// - **HTML**: The tag name in an element.
    Name,

    /// A discrete syntactic unit within a container, representing a single
    /// logical entry or instruction.
    ///
    /// This typically maps to a **Statement** in programming languages, or a standalone
    /// instruction in assembly. In markup, it could represent a list item or a table row.
    ///
    /// # Examples
    /// - **Rust**: A `Stmt` inside a block.
    /// - **Markdown**: `ListItem` or `TableCell`.
    /// - **SQL**: A standalone `Statement` or a `Clause` (like `WHERE`).
    /// - **ASM**: A single `Instruction` (e.g., `NOP`).
    Statement,

    /// A node representing a computed result or a complex logical operation.
    ///
    /// Unlike a simple `Value` (which is an atomic literal), an `Expression` involves
    /// operators or logic that must be evaluated.
    ///
    /// # Examples
    /// - **Rust**: `BinaryExpr`, `UnaryExpr`, or `RangeExpr`.
    /// - **SQL**: `BinaryOp` in a `WHERE` clause.
    /// - **Python**: `ListComprehension` or `Lambda`.
    Expression,

    /// A node that performs an invocation or call to a function, method, or macro.
    ///
    /// This role identifies the active execution of a named entity with optional arguments.
    ///
    /// # Examples
    /// - **Rust**: `CallExpr`, `MethodCallExpr`, or `MacroInvocation`.
    /// - **SQL**: `FunctionCall` (e.g., `COUNT(*)`).
    /// - **Excel**: A formula call.
    Call,

    /// A node representing an **atomic** data value or a primitive constant.
    ///
    /// This role is strictly for atomic values like numbers, strings, or booleans.
    /// It **does not** include composite structures like arrays `[]` or objects `{}`,
    /// which should be categorized as [`UniversalElementRole::Container`].
    ///
    /// # Examples
    /// - **Rust**: `Literal` (strings, numbers, booleans).
    /// - **Markdown**: `InlineCode`, `Emphasis`, or `Strong`.
    /// - **SQL**: `Literal` values.
    /// - **JSON/YAML**: Atomic `Scalar` values (strings, integers, nulls).
    Value,

    /// A node that acts as a host for content in a different language or a raw
    /// fragment requiring a separate parsing pass (Language Injection).
    ///
    /// # Examples
    /// - **HTML**: A `<script>` or `<style>` block containing JS/CSS.
    /// - **Markdown**: `CodeBlock` (host for other languages).
    /// - **Rust/Java**: A string literal containing SQL (if marked for injection).
    /// - **PHP**: Raw HTML fragments outside of `<?php ... ?>` tags.
    Embedded,

    /// A node specifically created to represent a syntax error or recovery point
    /// in the source code.
    Error,

    /// No specific structural role assigned or recognized for this element.
    None,
}
538
539impl ElementRole for UniversalElementRole {
540 fn universal(&self) -> UniversalElementRole {
541 *self
542 }
543
544 fn name(&self) -> &str {
545 match *self {
546 UniversalElementRole::Container => "meta.block",
547 UniversalElementRole::Statement => "meta.statement",
548 UniversalElementRole::Binding => "variable.other.declaration",
549 UniversalElementRole::Reference => "variable.other.usage",
550 UniversalElementRole::Call => "entity.name.function.call",
551 UniversalElementRole::Expression => "meta.expression",
552 UniversalElementRole::Value => "constant",
553 UniversalElementRole::Definition => "entity.name.function",
554 UniversalElementRole::Typing => "entity.name.type",
555 UniversalElementRole::Metadata => "meta.preprocessor",
556 UniversalElementRole::Attribute => "entity.other.attribute-name",
557 UniversalElementRole::AttributeKey => "entity.other.attribute-name.key",
558 UniversalElementRole::Detail => "meta.detail",
559 UniversalElementRole::Name => "entity.name",
560 UniversalElementRole::Embedded => "meta.embedded",
561 UniversalElementRole::Documentation => "comment.block.documentation",
562 UniversalElementRole::Root => "source",
563 UniversalElementRole::Error => "invalid",
564 UniversalElementRole::None => "none",
565 }
566 }
567}