xgrammar/compiler/
grammar_compiler.rs

1use autocxx::prelude::*;
2
3use crate::{
4    CxxUniquePtr,
5    FFIGrammarCompiler,
6    compiler::CompiledGrammar,
7    cxx_int,
8    cxx_longlong,
9    cxx_utils,
10    grammar::{self, StructuralTagItem},
11    tokenizer_info::TokenizerInfo,
12};
13
14/// The compiler for grammars. It is associated with a certain tokenizer info, and compiles
15/// grammars into `CompiledGrammar` with the tokenizer info. It allows parallel compilation with
16/// multiple threads, and has a cache to store the compilation result, avoiding compiling the
17/// same grammar multiple times.
18pub struct GrammarCompiler {
19    inner: CxxUniquePtr<FFIGrammarCompiler>,
20}
21
22impl GrammarCompiler {
23    /// Construct the compiler.
24    ///
25    /// Parameters
26    /// - `tokenizer_info`: The tokenizer info.
27    /// - `max_threads` (default: 8): The maximum number of threads used to compile the grammar.
28    /// - `cache_enabled` (default: true): Whether to enable the cache.
29    /// - `cache_limit_bytes` (default: -1): The maximum memory usage for the cache in bytes.
30    ///   Note that the actual memory usage may slightly exceed this value.
31    pub fn new(
32        tokenizer_info: &TokenizerInfo,
33        max_threads: i32,
34        cache_enabled: bool,
35        cache_limit_bytes: isize,
36    ) -> Self {
37        let inner = cxx_utils::make_grammar_compiler(
38            tokenizer_info.ffi_ref(),
39            cxx_int(max_threads),
40            cache_enabled,
41            cxx_longlong(cache_limit_bytes as i64),
42        );
43        Self { inner }
44    }
45
46    /// Get `CompiledGrammar` from the specified JSON schema and format. The indent
47    /// and separators parameters follow the same convention as in serde_json's pretty printing
48    /// (mirroring Python's json.dumps()).
49    ///
50    /// Parameters
51    /// - `schema`: The schema string.
52    /// - `any_whitespace`: Whether to allow any whitespace regardless of indent/separators.
53    /// - `indent`: The number of spaces for indentation. If None, the output will be in one line.
54    /// - `separators`: Two separators used in the schema: comma and colon. Examples: (",", ":"),
55    ///   (", ", ": "). If None, defaults to (",", ": ") when indent is Some, otherwise
56    ///   (", ", ": ").
57    /// - `strict_mode`: Whether to use strict mode. In strict mode, the generated grammar will not
58    ///   allow properties and items that are not specified in the schema. This is equivalent to
59    ///   setting unevaluatedProperties and unevaluatedItems to false.
60    pub fn compile_json_schema(
61        &mut self,
62        schema: &str,
63        any_whitespace: bool,
64        indent: Option<i32>,
65        separators: Option<(impl AsRef<str>, impl AsRef<str>)>,
66        strict_mode: bool,
67        max_whitespace_cnt: Option<i32>,
68    ) -> CompiledGrammar {
69        cxx::let_cxx_string!(schema_cxx = schema);
70        let has_indent = indent.is_some();
71        let indent_i32: i32 = indent.unwrap_or(0);
72        let has_separators = separators.is_some();
73        let (sep_comma, sep_colon) = if let Some((comma, colon)) = separators {
74            (comma.as_ref().to_string(), colon.as_ref().to_string())
75        } else {
76            (String::new(), String::new())
77        };
78        cxx::let_cxx_string!(sep_comma_cxx = sep_comma.as_str());
79        cxx::let_cxx_string!(sep_colon_cxx = sep_colon.as_str());
80
81        let unique_ptr = cxx_utils::compiler_compile_json_schema(
82            self.inner.as_mut().expect("GrammarCompiler inner is null"),
83            &schema_cxx,
84            any_whitespace,
85            has_indent,
86            cxx_int(indent_i32),
87            has_separators,
88            &sep_comma_cxx,
89            &sep_colon_cxx,
90            strict_mode,
91            max_whitespace_cnt.is_some(),
92            cxx_int(max_whitespace_cnt.unwrap_or(0)),
93        );
94        CompiledGrammar::from_unique_ptr(unique_ptr)
95    }
96
97    /// Get `CompiledGrammar` from the standard JSON.
98    pub fn compile_builtin_json_grammar(&mut self) -> CompiledGrammar {
99        let unique_ptr = cxx_utils::compiler_compile_builtin_json(
100            self.inner.as_mut().expect("GrammarCompiler inner is null"),
101        );
102        CompiledGrammar::from_unique_ptr(unique_ptr)
103    }
104
105    /// Get `CompiledGrammar` from the specified regex.
106    pub fn compile_regex(
107        &mut self,
108        regex: &str,
109    ) -> CompiledGrammar {
110        cxx::let_cxx_string!(regex_cxx = regex);
111        let unique_ptr = cxx_utils::compiler_compile_regex(
112            self.inner.as_mut().expect("GrammarCompiler inner is null"),
113            &regex_cxx,
114        );
115        CompiledGrammar::from_unique_ptr(unique_ptr)
116    }
117
118    /// Compile a grammar from structural tags.
119    ///
120    /// Parameters
121    /// - `tags`: The structural tags.
122    /// - `triggers`: The triggers. Each trigger should be a prefix of a provided begin tag.
123    pub fn compile_structural_tag(
124        &mut self,
125        tags: &[StructuralTagItem],
126        triggers: &[impl AsRef<str>],
127    ) -> CompiledGrammar {
128        // Build StructuralTag JSON: {"type":"structural_tag","format":{...}}
129        use serde_json::json;
130        let mut tag_entries = Vec::new();
131        for tag in tags {
132            let schema_value: serde_json::Value =
133                serde_json::from_str(&tag.schema)
134                    .expect("Invalid JSON schema in StructuralTagItem");
135            let content = json!({
136                "type": "json_schema",
137                "json_schema": schema_value
138            });
139            tag_entries.push(json!({
140                "type": "tag",
141                "begin": tag.begin,
142                "content": content,
143                "end": tag.end,
144            }));
145        }
146        let triggers_vec: Vec<String> =
147            triggers.iter().map(|t| t.as_ref().to_string()).collect();
148        let format_obj = json!({
149            "type": "triggered_tags",
150            "triggers": triggers_vec,
151            "tags": tag_entries,
152        });
153        let structural_tag_json = json!({
154            "type": "structural_tag",
155            "format": format_obj,
156        })
157        .to_string();
158
159        cxx::let_cxx_string!(structural_tag_str = structural_tag_json);
160        let unique_ptr = cxx_utils::compiler_compile_structural_tag(
161            self.inner.as_mut().expect("GrammarCompiler inner is null"),
162            &structural_tag_str,
163        );
164        CompiledGrammar::from_unique_ptr(unique_ptr)
165    }
166
167    /// Compile a grammar object to a `CompiledGrammar`.
168    pub fn compile_grammar(
169        &mut self,
170        grammar: &grammar::Grammar,
171    ) -> CompiledGrammar {
172        cxx::let_cxx_string!(error_out_cxx = "");
173        let unique_ptr = unsafe {
174            cxx_utils::compiler_compile_grammar_or_error(
175                self.inner.as_mut().expect("GrammarCompiler inner is null"),
176                grammar.ffi_ref(),
177                error_out_cxx.as_mut().get_unchecked_mut(),
178            )
179        };
180        if unique_ptr.is_null() {
181            let msg = error_out_cxx.to_string();
182            panic!("CompileGrammar threw: {}", msg);
183        }
184        CompiledGrammar::from_unique_ptr(unique_ptr)
185    }
186
187    /// Compile a grammar from an EBNF string. The string should follow the format described in
188    /// <https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md>
189    ///
190    /// Parameters
191    /// - `ebnf_string`: The grammar string in EBNF format.
192    /// - `root_rule_name`: The name of the root rule in the grammar.
193    pub fn compile_grammar_from_ebnf(
194        &mut self,
195        ebnf_string: &str,
196        root_rule_name: &str,
197    ) -> CompiledGrammar {
198        let grammar = grammar::Grammar::from_ebnf(ebnf_string, root_rule_name);
199        self.compile_grammar(&grammar)
200    }
201
202    /// Clear all cached compiled grammars.
203    pub fn clear_cache(&mut self) {
204        self.inner.as_mut().expect("GrammarCompiler inner is null").ClearCache();
205    }
206
207    /// The approximate memory usage of the cache in bytes.
208    pub fn get_cache_size_bytes(&self) -> i64 {
209        self.inner.as_ref().expect("GrammarCompiler inner is null").GetCacheSizeBytes().into()
210    }
211
212    /// The maximum memory usage for the cache in bytes. Returns -1 if unlimited.
213    pub fn cache_limit_bytes(&self) -> i64 {
214        self.inner.as_ref().expect("GrammarCompiler inner is null").CacheLimitBytes().into()
215    }
216}
217
218impl Drop for GrammarCompiler {
219    fn drop(&mut self) {
220    }
221}