// oak_core/language/mod.rs

1use std::{fmt::Debug, hash::Hash};
2
/// Coarse classification of the kind of language being described.
///
/// The framework consults the category when it needs a sensible default,
/// for example when choosing a lexer/parser strategy or picking a default
/// theme for syntax highlighting.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum LanguageCategory {
    /// A general-purpose programming language such as Rust, C, or Java.
    Programming,
    /// A markup or document language such as Markdown, HTML, or Typst.
    Markup,
    /// A configuration or data-serialization format such as YAML, JSON, or TOML.
    Config,
    /// A styling language such as CSS, Sass, or Less.
    StyleSheet,
    /// A domain-specific language or specialized notation such as SQL, Regex, or Math.
    Dsl,
    /// A modeling language such as UML, Mermaid, or PlantUML.
    Modeling,
    /// Anything else, including languages that have not been classified.
    Other,
}
26
27/// Language definition trait that coordinates all language-related types and behaviors.
28///
29/// This trait ties together language-specific components like lexers, parsers, and ASTs.
30/// It enables the framework to be language-agnostic while providing type-safe
31/// language-specific functionality.
32pub trait Language: Send + Sync {
33    /// The name of the language (e.g., "rust", "sql").
34    const NAME: &'static str;
35
36    /// The category of the language.
37    const CATEGORY: LanguageCategory = LanguageCategory::Programming;
38
39    /// The token type used to represent different token and node types in the language.
40    ///
41    /// This associated type defines how different syntactic elements (tokens, nodes) are
42    /// categorized and identified within the language. It must implement `Copy` and `Eq`
43    /// to ensure efficient handling in the parsing system.
44    ///
45    /// # Requirements
46    ///
47    /// The token type must:
48    /// - Implement the `TokenType` trait
49    /// - Be copyable to enable efficient passing
50    /// - Support equality comparison for token matching
51    /// - Be sendable across thread boundaries
52    ///
53    /// # Examples
54    ///
55    /// ```
56    /// #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
57    /// enum RustSyntaxKind {
58    ///     LetKeyword,
59    ///     Identifier,
60    ///     Number,
61    ///     // ... other token kinds
62    /// }
63    /// ```
64    type TokenType: TokenType;
65
66    /// The element type used to represent composite structures in the parsed tree.
67    ///
68    /// While tokens represent the atomic units of the language, elements represent
69    /// the composite structures formed by combining tokens according to grammar rules.
70    /// This includes expressions, statements, declarations, and other syntactic constructs.
71    ///
72    /// # Requirements
73    ///
74    /// The element type must:
75    /// - Implement the `ElementType` trait
76    /// - Be copyable for efficient handling
77    /// - Support equality comparison
78    /// - Be sendable across thread boundaries
79    type ElementType: ElementType;
80
81    /// The root type for the parsed tree that represents the top-level structure of the language.
82    ///
83    /// This associated type defines the structure of the root node in the parsed tree,
84    /// which typically contains the entire parsed source code organized according to the
85    /// language's grammar rules. The root type serves as the entry point for traversing
86    /// and manipulating the parsed representation.
87    ///
88    /// # Design Considerations
89    ///
90    /// The root type should:
91    /// - Contain references to all top-level language constructs
92    /// - Provide efficient access to the parsed content
93    /// - Support incremental updates when the source changes
94    ///
95    /// # Examples
96    ///
97    /// ```ignore
98    /// struct RustRoot {
99    ///     items: Vec<RustItem>,
100    /// }
101    ///
102    /// struct RustRoot {
103    ///     modules: Vec<Module>,
104    ///     imports: Vec<Import>,
105    ///     declarations: Vec<Declaration>,
106    /// }
107    /// ```
108    type TypedRoot;
109}
110
111/// Token type definitions for tokens in the parsing system.
112///
113/// This trait serves as the foundation for defining different types of tokens.
114/// By mapping language-specific kinds to [`UniversalTokenRole`], generic tools
115/// like highlighters and formatters can work across many languages.
116macro_rules! define_token_type {
117    ($($bound:tt)*) => {
118        /// A trait for types that represent a token's kind in a specific language.
119        pub trait TokenType: Copy + Eq + Hash + Send + Sync + std::fmt::Debug $($bound)* {
120            /// The associated role type for this token kind.
121            type Role: TokenRole;
122
123            /// A constant representing the end of the input stream.
124            const END_OF_STREAM: Self;
125
126            /// Returns the general syntactic role of this token.
127    ///
128    /// The role determines how the token is treated by generic language tools.
129    /// For example, tokens with [`UniversalTokenRole::Keyword`] are typically
130    /// highlighted using a specific theme color, regardless of the language.
131    fn role(&self) -> Self::Role;
132
133            /// Returns true if this token matches the specified language-specific role.
134            fn is_role(&self, role: Self::Role) -> bool {
135                self.role() == role
136            }
137
138            /// Returns true if this token matches the specified universal role.
139            fn is_universal(&self, role: UniversalTokenRole) -> bool {
140                self.role().universal() == role
141            }
142
143            /// Returns true if this token represents a comment.
144            fn is_comment(&self) -> bool {
145                self.is_universal(UniversalTokenRole::Comment)
146            }
147
148            /// Returns true if this token represents whitespace.
149            fn is_whitespace(&self) -> bool {
150                self.is_universal(UniversalTokenRole::Whitespace)
151            }
152
153            /// Returns true if this token represents an error condition.
154            fn is_error(&self) -> bool {
155                self.is_universal(UniversalTokenRole::Error)
156            }
157
158            /// Returns true if this token represents trivia (whitespace, comments, etc.).
159            fn is_ignored(&self) -> bool {
160                self.is_whitespace() || self.is_comment()
161            }
162
163            /// Returns true if this token represents the end of the input stream.
164            fn is_end_of_stream(&self) -> bool {
165                *self == Self::END_OF_STREAM
166            }
167        }
168    };
169}
170
171define_token_type!();
172
173/// A trait for types that can represent a token's syntactic role.
174pub trait TokenRole: Copy + Eq + Send {
175    /// Maps this role to a universal, language-agnostic role.
176    fn universal(&self) -> UniversalTokenRole;
177
178    /// Returns a specific name for this role, used for granular highlighting.
179    ///
180    /// For universal roles, this should return the standard scope name (e.g., "keyword").
181    /// For language-specific roles, it can return more specific names (e.g., "keyword.control").
182    fn name(&self) -> &str;
183}
184
/// Language-agnostic syntactic role of a token.
///
/// Mapping language-specific token kinds onto these roles lets generic tools
/// recognize names, literals, or operators across 100+ languages without
/// learning the details of each grammar.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
pub enum UniversalTokenRole {
    /// A reserved word or built-in command of the language (e.g. 'SELECT', 'let', 'MOV').
    Keyword,
    /// Any name-like token: identifiers, labels, keys, tags, and so on.
    Name,
    /// A literal value such as a string, number, boolean, or null.
    Literal,
    /// An escape sequence, or a special character representation, inside a literal.
    Escape,
    /// A mathematical, logical, or structural operator (e.g. '+', '=>', 'LIKE').
    Operator,
    /// A structural character such as a bracket, comma, or semicolon.
    Punctuation,
    /// A developer annotation or piece of documentation.
    Comment,
    /// A formatting character such as a space or tab.
    Whitespace,
    /// Content that is malformed or was not recognized.
    Error,
    /// The token has no specific role assigned.
    None,
    /// Marker for the end of the stream.
    Eof,
}
216
217impl TokenRole for UniversalTokenRole {
218    fn universal(&self) -> UniversalTokenRole {
219        *self
220    }
221
222    fn name(&self) -> &str {
223        match *self {
224            UniversalTokenRole::Keyword => "keyword",
225            UniversalTokenRole::Name => "variable.other",
226            UniversalTokenRole::Literal => "constant",
227            UniversalTokenRole::Escape => "constant.character.escape",
228            UniversalTokenRole::Operator => "keyword.operator",
229            UniversalTokenRole::Punctuation => "punctuation",
230            UniversalTokenRole::Comment => "comment",
231            UniversalTokenRole::Whitespace => "punctuation.whitespace",
232            UniversalTokenRole::Error => "invalid",
233            UniversalTokenRole::None => "none",
234            UniversalTokenRole::Eof => "punctuation.eof",
235        }
236    }
237}
238
239/// Element type definitions for nodes in the parsed tree.
240///
241/// While tokens represent the atomic units of a language, elements represent the
242/// composite structures formed by combining tokens according to grammar rules.
243/// This includes expressions, statements, declarations, and other syntactic constructs.
244///
245/// # Implementation Guidelines
246///
247/// When implementing this trait for a specific language:
248/// - Use an enum with discriminant values for efficient matching
249/// - Include a Root variant to identify the top-level element
250/// - Include an Error variant for malformed constructs
251/// - Define a `Role` associated type and implement the `role()` method.
252///
253/// # Examples
254///
255/// ```ignore
256/// #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
257/// enum MyElement {
258///     Root,
259///     FunctionDeclaration,
260///     Block,
261///     Error,
262/// }
263///
264/// impl ElementType for MyElement {
265///     type Role = UniversalElementRole;
266///
267///     fn role(&self) -> Self::Role {
268///         match self {
269///             MyElement::Root => UniversalElementRole::Root,
270///             MyElement::FunctionDeclaration => UniversalElementRole::Binding,
271///             MyElement::Block => UniversalElementRole::Container,
272///             MyElement::Error => UniversalElementRole::Error,
273///         }
274///     }
275///
276///     fn is_root(&self) -> bool {
277///         matches!(self, MyElement::Root)
278///     }
279///
280///     fn is_error(&self) -> bool {
281///         matches!(self, MyElement::Error)
282///     }
283/// }
284/// ```
285macro_rules! define_element_type {
286    ($($bound:tt)*) => {
287        /// A trait for types that represent an element's kind in a syntax tree.
288        pub trait ElementType: Copy + Eq + Hash + Send + Sync + std::fmt::Debug $($bound)* {
289            /// The associated role type for this element kind.
290            type Role: ElementRole;
291
292            /// Returns the general syntactic role of this element.
293    ///
294    /// The role enables structural analysis without knowing the specific grammar.
295    /// For example, a tool can find all [`UniversalElementRole::Definition`] nodes
296    /// to build a symbol outline for any supported language.
297    fn role(&self) -> Self::Role;
298
299            /// Returns true if this element matches the specified language-specific role.
300            fn is_role(&self, role: Self::Role) -> bool {
301                self.role() == role
302            }
303
304            /// Returns true if this element matches the specified universal role.
305            fn is_universal(&self, role: UniversalElementRole) -> bool {
306                self.role().universal() == role
307            }
308
309            /// Returns true if this element represents the root of the parsed tree.
310            fn is_root(&self) -> bool {
311                self.is_universal(UniversalElementRole::Root)
312            }
313
314            /// Returns true if this element represents an error condition.
315            fn is_error(&self) -> bool {
316                self.is_universal(UniversalElementRole::Error)
317            }
318        }
319    };
320}
321
322define_element_type!();
323
324/// A trait for types that can represent an element's structural role.
325pub trait ElementRole: Copy + Eq + Send {
326    /// Maps this role to a universal, language-agnostic role.
327    fn universal(&self) -> UniversalElementRole;
328
329    /// Returns a specific name for this role, used for granular highlighting.
330    fn name(&self) -> &str;
331}
332
/// Language-agnostic structural role of a syntax tree element.
///
/// Categorizing elements by role makes generic analysis possible across very
/// different language families: highlighters and symbol extractors, for
/// example, can locate definitions, references, or containers without knowing
/// the grammar of each language.
///
/// # Structural Groups
///
/// - **Flow Control**: [`UniversalElementRole::Statement`], [`UniversalElementRole::Expression`], [`UniversalElementRole::Call`], and [`UniversalElementRole::Root`].
/// - **Symbol Management**: [`UniversalElementRole::Definition`], [`UniversalElementRole::Binding`], and [`UniversalElementRole::Reference`].
/// - **Hierarchy**: [`UniversalElementRole::Container`].
/// - **Metadata**: [`UniversalElementRole::Typing`], [`UniversalElementRole::Metadata`], [`UniversalElementRole::Attribute`], [`UniversalElementRole::Documentation`].
///
/// If a node could plausibly take several roles, pick the one that expresses
/// its **primary structural intent**.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
#[non_exhaustive]
pub enum UniversalElementRole {
    /// The root of the syntax tree, standing for the entire document or source file.
    Root,

    /// A high-level structural node that establishes a scope or a logical grouping.
    Container,

    /// The complete declaration or definition of a symbol.
    ///
    /// This is the "whole" entity that defines something in the code, which
    /// makes it the key role for symbol trees and navigation outlines.
    ///
    /// # Examples
    /// - **Rust**: The entire `Fn` declaration block, `Struct` item, or `Enum`.
    /// - **Markdown**: `Heading` or `LinkDefinition`.
    /// - **SQL**: The whole `CREATE TABLE` or `CREATE PROCEDURE` statement.
    /// - **ASM**: A `Proc` (procedure) block or a multi-line data definition.
    /// - **YAML**: A schema-defined object or a complex configuration block.
    Definition,

    /// The node that actually binds a name to an entity.
    ///
    /// Where `Definition` covers the entire construct, `Binding` singles out
    /// the part of it (usually the identifier) that introduces the name.
    ///
    /// # Examples
    /// - **Rust**: The identifier node in a `let` pattern or function name.
    /// - **Markdown**: `LinkLabel` in a reference link definition.
    /// - **SQL**: The `Table` name identifier in `CREATE TABLE`.
    /// - **ASM**: A `Label` node (e.g., `main:`).
    /// - **YAML**: The `Key` in a key-value mapping.
    Binding,

    /// A use of a name or entity that is defined somewhere else.
    ///
    /// # Examples
    /// - **Rust**: `PathExpr` (variable usage) or `MethodCall`.
    /// - **Markdown**: `LinkReference` or `FootnoteReference`.
    /// - **SQL**: `ColumnName` in a `SELECT` clause or `TableName` in `FROM`.
    /// - **ASM**: A `Label` reference in a jump (e.g., `JMP main`).
    /// - **YAML**: An `Alias` anchor (e.g., `*anchor_name`).
    Reference,

    /// A type signature, type constraint, or reference to a type.
    ///
    /// Keeping type information apart from general logic and values is
    /// essential for type checking and intelligent completion.
    ///
    /// # Examples
    /// - **Rust**: `TypePath` (e.g., `: i32`), `GenericArgument`, or `WhereClause`.
    /// - **SQL**: `DataType` (e.g., `VARCHAR(255)` or `INT`).
    /// - **ASM**: Size specifiers (e.g., `DWORD`, `PTR`).
    /// - **TypeScript**: `TypeAnnotation` or `InterfaceDeclaration`.
    Typing,

    /// A structured comment or documentation node attached to another element.
    ///
    /// In contrast to raw `Comment` tokens, these are syntax nodes that may
    /// carry internal structure of their own (such as Markdown or tagged
    /// parameters).
    ///
    /// # Examples
    /// - **Rust**: `DocComment` (e.g., `/// ...`).
    /// - **Java**: `Javadoc` blocks.
    /// - **Python**: `Docstring` literals.
    Documentation,

    /// A high-level annotation, decorator, or macro that adds semantic information.
    ///
    /// # Metadata vs Attribute
    /// - **Metadata**: Usually a language-level extension that "decorates" an element
    ///   from the outside, often affecting compilation or runtime behavior (e.g., Rust attributes).
    /// - **Attribute**: Usually a built-in, structural property that belongs to the
    ///   element's native definition (e.g., HTML attributes).
    ///
    /// # Examples
    /// - **Rust**: `Attribute` (e.g., `#[derive(...)]`) or `MacroCall`.
    /// - **Markdown**: `Frontmatter` (YAML/TOML header).
    /// - **Java/TS**: `@Decorator` or `@Annotation`.
    /// - **Python**: `@decorator` syntax.
    Metadata,

    /// A single property, flag, or attribute-value pair.
    ///
    /// `Metadata` decorates an element with external logic; `Attribute`, by
    /// contrast, is an intrinsic property defined by the language's schema or
    /// structure.
    ///
    /// # Examples
    /// - **HTML/XML**: An `Attribute` (e.g., `id="main"`).
    /// - **Markdown**: `LinkTitle` or `ImageAlt` text.
    /// - **YAML**: A specific configuration property.
    /// - **ASM**: Segment attributes (e.g., `READONLY`, `EXECUTE`).
    Attribute,

    /// The key half of an attribute, property, or configuration entry.
    ///
    /// It gets a role of its own because:
    /// - It is not a **Reference** (it does not point at an external symbol).
    /// - It is not a traditional **Binding** (it does not define a symbol in a global or lexical scope).
    /// - It is not a **Keyword** (it is typically a user-defined or schema-defined identifier).
    ///
    /// # Examples
    /// - **HTML**: The `id` in `id="main"`.
    /// - **Markdown**: `AttributeName` (in Pandoc-style `{ #id .class }`).
    /// - **YAML**: The key in a property mapping.
    /// - **TOML**: The key in a table entry.
    AttributeKey,

    /// A node carrying additional details or secondary information about another element.
    ///
    /// # Examples
    /// - **Rust**: `GenericParameter` list, `FunctionParameter` list.
    /// - **SQL**: `Constraint` details.
    Detail,

    /// The name of an element, typically as it appears in a declaration.
    ///
    /// # Examples
    /// - **Rust**: The name identifier in a function or struct definition.
    /// - **HTML**: The tag name in an element.
    Name,

    /// A discrete syntactic unit inside a container: one logical entry or
    /// instruction.
    ///
    /// In programming languages this usually corresponds to a **Statement**,
    /// in assembly to a standalone instruction. In markup it may be a list
    /// item or a table row.
    ///
    /// # Examples
    /// - **Rust**: A `Stmt` inside a block.
    /// - **Markdown**: `ListItem` or `TableCell`.
    /// - **SQL**: A standalone `Statement` or a `Clause` (like `WHERE`).
    /// - **ASM**: A single `Instruction` (e.g., `NOP`).
    Statement,

    /// A computed result or a complex logical operation.
    ///
    /// A simple `Value` is an atomic literal; an `Expression` involves
    /// operators or logic that has to be evaluated.
    ///
    /// # Examples
    /// - **Rust**: `BinaryExpr`, `UnaryExpr`, or `RangeExpr`.
    /// - **SQL**: `BinaryOp` in a `WHERE` clause.
    /// - **Python**: `ListComprehension` or `Lambda`.
    Expression,

    /// An invocation of a function, method, or macro.
    ///
    /// The role marks the active execution of a named entity, with optional
    /// arguments.
    ///
    /// # Examples
    /// - **Rust**: `CallExpr`, `MethodCallExpr`, or `MacroInvocation`.
    /// - **SQL**: `FunctionCall` (e.g., `COUNT(*)`).
    /// - **Excel**: A formula call.
    Call,

    /// An **atomic** data value or primitive constant.
    ///
    /// Only atomic values such as numbers, strings, or booleans belong here.
    /// Composite structures like arrays `[]` or objects `{}` are **not**
    /// values; categorize them as [`UniversalElementRole::Container`].
    ///
    /// # Examples
    /// - **Rust**: `Literal` (strings, numbers, booleans).
    /// - **Markdown**: `InlineCode`, `Emphasis`, or `Strong`.
    /// - **SQL**: `Literal` values.
    /// - **JSON/YAML**: Atomic `Scalar` values (strings, integers, nulls).
    Value,

    /// A host node for content written in a different language, or for a raw
    /// fragment that needs a separate parsing pass (Language Injection).
    ///
    /// # Examples
    /// - **HTML**: A `<script>` or `<style>` block containing JS/CSS.
    /// - **Markdown**: `CodeBlock` (host for other languages).
    /// - **Rust/Java**: A string literal containing SQL (if marked for injection).
    /// - **PHP**: Raw HTML fragments outside of `<?php ... ?>` tags.
    Embedded,

    /// A node created specifically to represent a syntax error or a recovery
    /// point in the source code.
    Error,

    /// The element has no specific structural role assigned or recognized.
    None,
}
538
539impl ElementRole for UniversalElementRole {
540    fn universal(&self) -> UniversalElementRole {
541        *self
542    }
543
544    fn name(&self) -> &str {
545        match *self {
546            UniversalElementRole::Container => "meta.block",
547            UniversalElementRole::Statement => "meta.statement",
548            UniversalElementRole::Binding => "variable.other.declaration",
549            UniversalElementRole::Reference => "variable.other.usage",
550            UniversalElementRole::Call => "entity.name.function.call",
551            UniversalElementRole::Expression => "meta.expression",
552            UniversalElementRole::Value => "constant",
553            UniversalElementRole::Definition => "entity.name.function",
554            UniversalElementRole::Typing => "entity.name.type",
555            UniversalElementRole::Metadata => "meta.preprocessor",
556            UniversalElementRole::Attribute => "entity.other.attribute-name",
557            UniversalElementRole::AttributeKey => "entity.other.attribute-name.key",
558            UniversalElementRole::Detail => "meta.detail",
559            UniversalElementRole::Name => "entity.name",
560            UniversalElementRole::Embedded => "meta.embedded",
561            UniversalElementRole::Documentation => "comment.block.documentation",
562            UniversalElementRole::Root => "source",
563            UniversalElementRole::Error => "invalid",
564            UniversalElementRole::None => "none",
565        }
566    }
567}