// llguidance/api.rs
1use std::fmt::{Debug, Display};
2
3use anyhow::{bail, Result};
4use derivre::RegexAst;
5use serde::{Deserialize, Serialize};
6use serde_json::{json, Value};
7
8use crate::{
9 earley::{lexerspec::LexerSpec, Grammar},
10 regex_to_lark,
11};
12
13pub use crate::earley::ValidationResult;
14
/// This represents a collection of grammars, with a designated
/// "start" grammar at first position.
/// Grammars can refer to each other via GenGrammar nodes.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TopLevelGrammar {
    /// The grammars in this collection; the first entry is the "start" grammar.
    pub grammars: Vec<GrammarWithLexer>,
    /// Optional cap on the total number of tokens generated
    /// (presumably mapped to `StopReason::MaxTokensTotal` — confirm at use site).
    pub max_tokens: Option<usize>,
}
23
/// How a grammar is supplied to the engine: either in serialized form,
/// or as already-built internal structures.
#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum GrammarInit {
    /// Grammar in its serialized (JSON-derived) form, still to be compiled.
    Serialized(TopLevelGrammar),
    /// Already-built internal grammar together with its lexer specification.
    Internal(Grammar, LexerSpec),
}
30
/// Default for the per-lexeme `contextual` flag (presumably; see the
/// commented-out `contextual` notes further down — confirm at use sites).
/// cbindgen:ignore
pub const DEFAULT_CONTEXTUAL: bool = true;
33
/// Grammar-wide options.
/// In lark syntax, this can be specified as JSON object after '%llguidance' declaration in the grammar.
/// All fields are `#[serde(default)]`, so any subset may appear in the JSON.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LLGuidanceOptions {
    /// Normally, when a sequence of bytes is forced by grammar, it is tokenized
    /// canonically and forced as tokens.
    /// With `no_forcing`, we let the model decide on tokenization.
    /// This generally reduces both quality and speed, so should not be used
    /// outside of testing.
    #[serde(default)]
    pub no_forcing: bool,

    /// If set, the grammar will allow invalid utf8 byte sequences.
    /// Any Unicode regex will cause an error.
    /// This is very unlikely what you need.
    #[serde(default)]
    pub allow_invalid_utf8: bool,

    /// If set, the grammar will allow the %ignore lexeme at the start of the grammar.
    /// Otherwise, it will only be allowed after the first non-ignored lexeme.
    /// This option (like the other options here) will apply to the entire grammar,
    /// including nested sub-grammars.
    #[serde(default)]
    pub allow_initial_skip: bool,
}
58
59impl LLGuidanceOptions {
60 pub fn apply(&mut self, other: &LLGuidanceOptions) {
61 if other.no_forcing {
62 self.no_forcing = true;
63 }
64 if other.allow_invalid_utf8 {
65 self.allow_invalid_utf8 = true;
66 }
67 if other.allow_initial_skip {
68 self.allow_initial_skip = true;
69 }
70 }
71}
72
/// A single grammar, described either by a JSON schema or by Lark source
/// (exactly one of the two is expected to be set — confirm with callers).
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
    /// The name of this grammar, can be used in GenGrammar nodes.
    pub name: Option<String>,

    /// The JSON schema that the grammar should generate.
    /// When this is set, nodes and rx_nodes must be empty.
    pub json_schema: Option<Value>,

    /// The Lark grammar that the grammar should generate.
    /// When this is set, nodes and rx_nodes must be empty.
    pub lark_grammar: Option<String>,
    // Grammar-level options were at one point flattened here; kept for reference.
    // #[serde(flatten)]
    // pub options: LLGuidanceOptions,
}
88
89impl Debug for GrammarWithLexer {
90 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91 write!(
92 f,
93 "GrammarWithLexer [{}]",
94 if self.lark_grammar.is_some() {
95 "lark"
96 } else {
97 "json"
98 }
99 )
100 }
101}
102
103// /// If false, all other lexemes are excluded when this lexeme is recognized.
104// /// This is normal behavior for keywords in programming languages.
105// /// Set to true for eg. a JSON schema with both `/"type"/` and `/"[^"]*"/` as lexemes,
106// /// or for "get"/"set" contextual keywords in C#.
107// /// Default value set in GrammarWithLexer.
108// contextual: Option<bool>,
109
110// /// It lists the allowed escape sequences, typically one of:
111// /// "nrbtf\\\"u" - to allow all JSON escapes, including \u00XX for control characters
112// /// this is the default
113// /// "nrbtf\\\"" - to disallow \u00XX control characters
114// /// "nrt\\\"" - to also disallow unusual escapes (\f and \b)
115// /// "" - to disallow all escapes
116// /// Note that \uXXXX for non-control characters (code points above U+001F) are never allowed,
117// /// as they never have to be quoted in JSON.
118// json_allowed_escapes: Option<String>,
119
/// Optional fields allowed on any Node
#[derive(Serialize, Deserialize, Default, Clone, PartialEq, Eq)]
pub struct NodeProps {
    /// Limit on the number of tokens attributed to this node
    /// (presumably — confirm semantics against the parser).
    pub max_tokens: Option<usize>,
    /// Name of the node; NOTE(review): exact semantics (label vs. reference)
    /// not visible from this file.
    pub name: Option<String>,
    /// When set, text matched by this node is recorded as a capture
    /// under this name (presumably — confirm against capture handling).
    pub capture_name: Option<String>,
}
127
/// Options controlling a gen() node: regex-constrained free generation
/// with an optional stop condition.
#[derive(Clone)]
pub struct GenOptions {
    /// Regular expression matching the body of generation.
    pub body_rx: RegexAst,

    /// The whole generation must match `body_rx + stop_rx`.
    /// Whatever matched `stop_rx` is discarded.
    /// If `stop_rx` is empty, it's assumed to be EOS.
    pub stop_rx: RegexAst,

    /// When set, the string matching `stop_rx` will be output as a capture
    /// with the given name.
    pub stop_capture_name: Option<String>,

    /// Lazy gen()s take the shortest match. Non-lazy take the longest.
    /// If not specified, the gen() is lazy if stop_rx is non-empty.
    pub lazy: Option<bool>,

    /// Treat stop_rx as suffix, i.e., do not hide it from the LLM
    /// (but do not include it in the capture).
    pub is_suffix: Option<bool>,

    /// Override sampling temperature.
    pub temperature: Option<f32>,
}
153
/// Options for a GenGrammar node: invoke another grammar of the collection.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct GenGrammarOptions {
    /// Which grammar to invoke (referenced by name).
    pub grammar: GrammarId,

    /// Override sampling temperature.
    pub temperature: Option<f32>,
}
161
/// Reference to a grammar in a `TopLevelGrammar` collection.
/// Serialized untagged, so it appears in JSON as a plain string.
#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)]
#[serde(untagged)]
pub enum GrammarId {
    /// Grammar referenced by its `name` field.
    Name(String),
}
167
168impl Display for GrammarId {
169 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170 match self {
171 GrammarId::Name(s) => write!(f, "@{s}"),
172 }
173 }
174}
175
/// Extension attributes for substring-matching lexemes.
/// Unknown JSON fields are rejected (`deny_unknown_fields`).
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(deny_unknown_fields)]
pub struct RegexExt {
    /// The lexeme should accept any (possibly empty) contiguous sequence of these chunks.
    pub substring_chunks: Option<Vec<String>>,
    /// Similar to `substring_chunks: s.split(/\s+/)`
    pub substring_words: Option<String>,
    /// Similar to `substring_chunks: s.split('')`
    pub substring_chars: Option<String>,
}
186
/// Why the parser stopped (or `NotStopped` while it is still running).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum StopReason {
    /// Parser has not emitted stop() yet.
    NotStopped,
    /// max_tokens limit on the total number of tokens has been reached.
    MaxTokensTotal,
    /// max_tokens limit on the number of tokens in the top-level parser has been reached. (no longer used)
    MaxTokensParser,
    /// Top-level parser indicates that no more bytes can be added.
    NoExtension,
    /// Top-level parser indicates that no more bytes can be added, however it was recognized late.
    NoExtensionBias,
    /// Top-level parser allowed EOS (as it was in an accepting state), and EOS was generated.
    EndOfSentence,
    /// Something went wrong with creating a nested parser.
    InternalError,
    /// The lexer is too complex
    LexerTooComplex,
    /// The parser is too complex
    ParserTooComplex,
}
208
209impl Display for StopReason {
210 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211 write!(
212 f,
213 "{}",
214 serde_json::to_value(self).unwrap().as_str().unwrap()
215 )
216 }
217}
218
219impl StopReason {
220 pub fn is_ok(&self) -> bool {
221 matches!(
222 self,
223 StopReason::NotStopped
224 | StopReason::EndOfSentence
225 | StopReason::NoExtension
226 | StopReason::NoExtensionBias
227 )
228 }
229}
230
/// Resource limits for grammar/lexer construction and per-token-mask
/// computation. All fields have defaults (see the `Default` impl).
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(default)]
#[repr(C)]
pub struct ParserLimits {
    /// For non-ambiguous grammars, this is the maximum "branching factor" of the grammar.
    /// For ambiguous grammars, this might get hit much quicker.
    /// Default: 2000
    pub max_items_in_row: usize,

    /// How much "fuel" are we willing to spend to build initial lexer regex AST nodes.
    /// Default: 1_000_000
    /// Speed: 50k/ms
    pub initial_lexer_fuel: u64,

    /// Maximum lexer fuel for computation of the whole token mask.
    /// Default: 200_000
    /// Speed: 14k/ms
    pub step_lexer_fuel: u64,

    /// Number of Earley items created for the whole token mask.
    /// Default: 50_000
    /// Speed: 20k/ms
    pub step_max_items: usize,

    /// Maximum number of lexer states.
    /// Affects memory consumption, but not the speed for the most part.
    /// Default: 250_000
    /// Speed: ~1-2kB of memory per state
    pub max_lexer_states: usize,

    /// Maximum size of the grammar (symbols in productions)
    /// Default: 500_000 (a few megabytes of JSON)
    pub max_grammar_size: usize,

    /// If true, we'll run any extremely large regexes against the whole
    /// trie of the tokenizer while constructing the lexer.
    /// This reduces future mask computation time, but increases
    /// the time it takes to construct the lexer.
    /// Default: true
    pub precompute_large_lexemes: bool,

    /// If true, include parser state (including tokens so far) and grammar in
    /// errors.
    /// Default: true
    pub verbose_errors: bool,
}
277
278impl Default for ParserLimits {
279 fn default() -> Self {
280 Self {
281 max_items_in_row: 2000,
282 initial_lexer_fuel: 1_000_000, // fhir schema => 500k
283 step_lexer_fuel: 200_000, //
284 max_lexer_states: 250_000, //
285 max_grammar_size: 500_000, // fhir schema => 200k
286 step_max_items: 50_000, //
287 precompute_large_lexemes: true,
288 verbose_errors: true,
289 }
290 }
291}
292
293impl TopLevelGrammar {
294 pub fn from_lark_or_grammar_list(s: &str) -> Result<Self> {
295 let first_non_whitespace = s.chars().find(|c| !c.is_whitespace());
296 if first_non_whitespace.is_none() {
297 bail!("Empty grammar");
298 }
299 if first_non_whitespace == Some('{') {
300 Ok(serde_json::from_str(s)?)
301 } else {
302 Ok(TopLevelGrammar::from_lark(s.to_string()))
303 }
304 }
305
306 pub fn from_regex(rx: &str) -> Self {
307 Self::from_grammar(GrammarWithLexer::from_regex(rx))
308 }
309
310 pub fn from_lark(lark_grammar: String) -> Self {
311 Self::from_grammar(GrammarWithLexer::from_lark(lark_grammar))
312 }
313
314 pub fn from_json_schema(json_schema: Value) -> Self {
315 Self::from_grammar(GrammarWithLexer::from_json_schema(json_schema))
316 }
317
318 pub fn from_grammar(grammar: GrammarWithLexer) -> Self {
319 TopLevelGrammar {
320 grammars: vec![grammar],
321 max_tokens: None,
322 }
323 }
324
325 /// The data is of different format, depending on tag:
326 /// - "regex" - data is regular expression in rust regex format
327 /// see <https://docs.rs/regex/latest/regex/#syntax>
328 /// - "json" or "json_schema" - data is (stringifed) JSON schema
329 /// see <https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md>
330 /// - "json_object" - equivalent to JSON schema: {"type":"object"}
331 /// - "lark" - data is grammar in a variant of Lark syntax
332 /// see <https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md>
333 /// - "llguidance" or "guidance" - data is a list of Lark or JSON schemas in JSON format
334 pub fn from_tagged_str(tag: &str, data: &str) -> Result<Self> {
335 match tag {
336 "regex" => Ok(Self::from_regex(data)),
337 "json" | "json_schema" => Ok(Self::from_json_schema(serde_json::from_str(data)?)),
338 "json_object" => Ok(Self::from_json_schema(json!({"type": "object"}))),
339 "lark" => Ok(Self::from_lark(data.to_string())),
340 "llguidance" | "guidance" => Self::from_lark_or_grammar_list(data),
341 _ => bail!("unknown constraint type: {tag}"),
342 }
343 }
344}
345
346impl GrammarWithLexer {
347 pub fn from_lark(lark_grammar: String) -> Self {
348 GrammarWithLexer {
349 name: Some("lark_grammar".to_string()),
350 lark_grammar: Some(lark_grammar),
351 ..GrammarWithLexer::default()
352 }
353 }
354
355 pub fn from_json_schema(json_schema: Value) -> Self {
356 GrammarWithLexer {
357 name: Some("json_schema".to_string()),
358 json_schema: Some(json_schema),
359 ..GrammarWithLexer::default()
360 }
361 }
362
363 pub fn from_regex(rx: &str) -> Self {
364 let rx = regex_to_lark(rx, "");
365 let mut r = Self::from_lark(format!("start: /{rx}/"));
366 r.name = Some("regex".to_string());
367 r
368 }
369}