//! llguidance `api.rs` — public API types for specifying grammars
//! (top-level grammar collections, per-grammar options, parser limits,
//! and stop reasons).
use std::fmt::{Debug, Display};

use anyhow::{bail, Result};
use derivre::RegexAst;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};

use crate::{
    earley::{lexerspec::LexerSpec, Grammar},
    regex_to_lark,
};

pub use crate::earley::ValidationResult;

/// This represents a collection of grammars, with a designated
/// "start" grammar at first position.
/// Grammars can refer to each other via GenGrammar nodes.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TopLevelGrammar {
    /// The grammars; the first one is the designated "start" grammar.
    pub grammars: Vec<GrammarWithLexer>,
    /// NOTE(review): presumably a cap on the total number of tokens
    /// generated under this grammar — confirm against the parser.
    pub max_tokens: Option<usize>,
}

/// A grammar either in its serialized (API) form, or already built
/// into the internal representation.
#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum GrammarInit {
    /// Grammar as received over the API, not yet compiled.
    Serialized(TopLevelGrammar),
    /// Already-built grammar together with its lexer specification.
    Internal(Grammar, LexerSpec),
}

/// Default for whether a lexeme is "contextual" (i.e., other lexemes
/// remain allowed while it is recognized; see the commented-out
/// `contextual` field documentation below).
/// cbindgen:ignore
pub const DEFAULT_CONTEXTUAL: bool = true;

/// In lark syntax, this can be specified as JSON object after '%llguidance' declaration in the grammar.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LLGuidanceOptions {
    /// Normally, when a sequence of bytes is forced by grammar, it is tokenized
    /// canonically and forced as tokens.
    /// With `no_forcing`, we let the model decide on tokenization.
    /// This generally reduces both quality and speed, so should not be used
    /// outside of testing.
    #[serde(default)]
    pub no_forcing: bool,

    /// If set, the grammar will allow invalid utf8 byte sequences.
    /// Any Unicode regex will cause an error.
    /// This is very unlikely to be what you need.
    #[serde(default)]
    pub allow_invalid_utf8: bool,

    /// If set, the grammar will allow the %ignore lexeme at the start of the grammar.
    /// Otherwise, it will only be allowed after the first non-ignored lexeme.
    /// This option (like the other options here) will apply to the entire grammar,
    /// including nested sub-grammars.
    #[serde(default)]
    pub allow_initial_skip: bool,
}

59impl LLGuidanceOptions {
60    pub fn apply(&mut self, other: &LLGuidanceOptions) {
61        if other.no_forcing {
62            self.no_forcing = true;
63        }
64        if other.allow_invalid_utf8 {
65            self.allow_invalid_utf8 = true;
66        }
67        if other.allow_initial_skip {
68            self.allow_initial_skip = true;
69        }
70    }
71}
72
/// A single grammar definition; typically exactly one of `json_schema`
/// or `lark_grammar` is set (see the constructors and the `Debug` impl).
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
    /// The name of this grammar, can be used in GenGrammar nodes.
    pub name: Option<String>,

    /// The JSON schema that the grammar should generate.
    /// When this is set, nodes and rx_nodes must be empty.
    pub json_schema: Option<Value>,

    /// The Lark grammar that the grammar should generate.
    /// When this is set, nodes and rx_nodes must be empty.
    pub lark_grammar: Option<String>,
    // #[serde(flatten)]
    // pub options: LLGuidanceOptions,
}

89impl Debug for GrammarWithLexer {
90    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91        write!(
92            f,
93            "GrammarWithLexer [{}]",
94            if self.lark_grammar.is_some() {
95                "lark"
96            } else {
97                "json"
98            }
99        )
100    }
101}
102
// /// If false, all other lexemes are excluded when this lexeme is recognized.
// /// This is normal behavior for keywords in programming languages.
// /// Set to true for eg. a JSON schema with both `/"type"/` and `/"[^"]*"/` as lexemes,
// /// or for "get"/"set" contextual keywords in C#.
// /// Default value set in GrammarWithLexer.
// contextual: Option<bool>,

// /// Lists the allowed escape sequences, typically one of:
// /// "nrbtf\\\"u" - to allow all JSON escapes, including \u00XX for control characters;
// ///     this is the default
// /// "nrbtf\\\"" - to disallow \u00XX control characters
// /// "nrt\\\"" - to also disallow unusual escapes (\f and \b)
// /// "" - to disallow all escapes
// /// Note that \uXXXX for non-control characters (code points above U+001F) are never allowed,
// /// as they never have to be quoted in JSON.
// json_allowed_escapes: Option<String>,

/// Optional fields allowed on any Node
#[derive(Serialize, Deserialize, Default, Clone, PartialEq, Eq)]
pub struct NodeProps {
    /// NOTE(review): presumably a token cap for this node's expansion —
    /// confirm against the parser.
    pub max_tokens: Option<usize>,
    /// Name of the node.
    pub name: Option<String>,
    /// When set, text matched by this node is reported as a capture
    /// under this name (cf. `stop_capture_name` in `GenOptions`).
    pub capture_name: Option<String>,
}

/// Options controlling a free-form generation (gen()) node.
#[derive(Clone)]
pub struct GenOptions {
    /// Regular expression matching the body of generation.
    pub body_rx: RegexAst,

    /// The whole generation must match `body_rx + stop_rx`.
    /// Whatever matched `stop_rx` is discarded.
    /// If `stop_rx` is empty, it's assumed to be EOS.
    pub stop_rx: RegexAst,

    /// When set, the string matching `stop_rx` will be output as a capture
    /// with the given name.
    pub stop_capture_name: Option<String>,

    /// Lazy gen()s take the shortest match. Non-lazy take the longest.
    /// If not specified, the gen() is lazy if stop_rx is non-empty.
    pub lazy: Option<bool>,

    /// Treat stop_rx as suffix, i.e., do not hide it from the LLM
    /// (but do not include it in the capture).
    pub is_suffix: Option<bool>,

    /// Override sampling temperature.
    pub temperature: Option<f32>,
}

/// Options for a GenGrammar node, which invokes another grammar
/// from the collection by its id.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct GenGrammarOptions {
    /// Which grammar to run, referenced by name.
    pub grammar: GrammarId,

    /// Override sampling temperature.
    pub temperature: Option<f32>,
}

/// Identifier of a grammar within a grammar collection.
/// `untagged` makes it (de)serialize as the bare inner value.
#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)]
#[serde(untagged)]
pub enum GrammarId {
    /// Reference by the grammar's `name` field.
    Name(String),
}

168impl Display for GrammarId {
169    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170        match self {
171            GrammarId::Name(s) => write!(f, "@{s}"),
172        }
173    }
174}
175
/// Extensions to plain regexes for substring-style matching.
/// Unknown JSON fields are rejected (`deny_unknown_fields`).
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(deny_unknown_fields)]
pub struct RegexExt {
    /// The lexeme should accept any (possibly empty) contiguous sequence of these chunks.
    pub substring_chunks: Option<Vec<String>>,
    /// Similar to `substring_chunks: s.split(/\s+/)`
    pub substring_words: Option<String>,
    /// Similar to `substring_chunks: s.split('')`
    pub substring_chars: Option<String>,
}

/// Why the parser stopped (or that it has not stopped yet).
/// The `Display` impl uses the serde (variant) name.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum StopReason {
    /// Parser has not emitted stop() yet.
    NotStopped,
    /// max_tokens limit on the total number of tokens has been reached.
    MaxTokensTotal,
    /// max_tokens limit on the number of tokens in the top-level parser has been reached. (no longer used)
    MaxTokensParser,
    /// Top-level parser indicates that no more bytes can be added.
    NoExtension,
    /// Top-level parser indicates that no more bytes can be added, however it was recognized late.
    NoExtensionBias,
    /// Top-level parser allowed EOS (as it was in an accepting state), and EOS was generated.
    EndOfSentence,
    /// Something went wrong with creating a nested parser.
    InternalError,
    /// The lexer is too complex
    LexerTooComplex,
    /// The parser is too complex
    ParserTooComplex,
}

209impl Display for StopReason {
210    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211        write!(
212            f,
213            "{}",
214            serde_json::to_value(self).unwrap().as_str().unwrap()
215        )
216    }
217}
218
219impl StopReason {
220    pub fn is_ok(&self) -> bool {
221        matches!(
222            self,
223            StopReason::NotStopped
224                | StopReason::EndOfSentence
225                | StopReason::NoExtension
226                | StopReason::NoExtensionBias
227        )
228    }
229}
230
/// Resource limits for lexer/parser construction and stepping.
/// All fields have serde defaults (see `Default` impl below).
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(default)]
#[repr(C)]
pub struct ParserLimits {
    /// For non-ambiguous grammars, this is the maximum "branching factor" of the grammar.
    /// For ambiguous grammars, this might get hit much quicker.
    /// Default: 2000
    pub max_items_in_row: usize,

    /// How much "fuel" are we willing to spend to build initial lexer regex AST nodes.
    /// Default: 1_000_000
    /// Speed: 50k/ms
    pub initial_lexer_fuel: u64,

    /// Maximum lexer fuel for computation of the whole token mask.
    /// Default: 200_000
    /// Speed: 14k/ms
    pub step_lexer_fuel: u64,

    /// Number of Earley items created for the whole token mask.
    /// Default: 50_000
    /// Speed: 20k/ms
    pub step_max_items: usize,

    /// Maximum number of lexer states.
    /// Affects memory consumption, but not the speed for the most part.
    /// Default: 250_000
    /// Speed: ~1-2kB of memory per state
    pub max_lexer_states: usize,

    /// Maximum size of the grammar (symbols in productions)
    /// Default: 500_000 (a few megabytes of JSON)
    pub max_grammar_size: usize,

    /// If true, we'll run any extremely large regexes against the whole
    /// trie of the tokenizer while constructing the lexer.
    /// This reduces future mask computation time, but increases
    /// the time it takes to construct the lexer.
    /// Default: true
    pub precompute_large_lexemes: bool,

    /// If true, include parser state (including tokens so far) and grammar in
    /// errors.
    /// Default: true
    pub verbose_errors: bool,
}

278impl Default for ParserLimits {
279    fn default() -> Self {
280        Self {
281            max_items_in_row: 2000,
282            initial_lexer_fuel: 1_000_000, // fhir schema => 500k
283            step_lexer_fuel: 200_000,      //
284            max_lexer_states: 250_000,     //
285            max_grammar_size: 500_000,     // fhir schema => 200k
286            step_max_items: 50_000,        //
287            precompute_large_lexemes: true,
288            verbose_errors: true,
289        }
290    }
291}
292
293impl TopLevelGrammar {
294    pub fn from_lark_or_grammar_list(s: &str) -> Result<Self> {
295        let first_non_whitespace = s.chars().find(|c| !c.is_whitespace());
296        if first_non_whitespace.is_none() {
297            bail!("Empty grammar");
298        }
299        if first_non_whitespace == Some('{') {
300            Ok(serde_json::from_str(s)?)
301        } else {
302            Ok(TopLevelGrammar::from_lark(s.to_string()))
303        }
304    }
305
306    pub fn from_regex(rx: &str) -> Self {
307        Self::from_grammar(GrammarWithLexer::from_regex(rx))
308    }
309
310    pub fn from_lark(lark_grammar: String) -> Self {
311        Self::from_grammar(GrammarWithLexer::from_lark(lark_grammar))
312    }
313
314    pub fn from_json_schema(json_schema: Value) -> Self {
315        Self::from_grammar(GrammarWithLexer::from_json_schema(json_schema))
316    }
317
318    pub fn from_grammar(grammar: GrammarWithLexer) -> Self {
319        TopLevelGrammar {
320            grammars: vec![grammar],
321            max_tokens: None,
322        }
323    }
324
325    /// The data is of different format, depending on tag:
326    /// - "regex" - data is regular expression in rust regex format
327    ///   see <https://docs.rs/regex/latest/regex/#syntax>
328    /// - "json" or "json_schema" - data is (stringifed) JSON schema
329    ///   see <https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md>
330    /// - "json_object" - equivalent to JSON schema: {"type":"object"}
331    /// - "lark" - data is grammar in a variant of Lark syntax
332    ///   see <https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md>
333    /// - "llguidance" or "guidance" - data is a list of Lark or JSON schemas in JSON format
334    pub fn from_tagged_str(tag: &str, data: &str) -> Result<Self> {
335        match tag {
336            "regex" => Ok(Self::from_regex(data)),
337            "json" | "json_schema" => Ok(Self::from_json_schema(serde_json::from_str(data)?)),
338            "json_object" => Ok(Self::from_json_schema(json!({"type": "object"}))),
339            "lark" => Ok(Self::from_lark(data.to_string())),
340            "llguidance" | "guidance" => Self::from_lark_or_grammar_list(data),
341            _ => bail!("unknown constraint type: {tag}"),
342        }
343    }
344}
345
346impl GrammarWithLexer {
347    pub fn from_lark(lark_grammar: String) -> Self {
348        GrammarWithLexer {
349            name: Some("lark_grammar".to_string()),
350            lark_grammar: Some(lark_grammar),
351            ..GrammarWithLexer::default()
352        }
353    }
354
355    pub fn from_json_schema(json_schema: Value) -> Self {
356        GrammarWithLexer {
357            name: Some("json_schema".to_string()),
358            json_schema: Some(json_schema),
359            ..GrammarWithLexer::default()
360        }
361    }
362
363    pub fn from_regex(rx: &str) -> Self {
364        let rx = regex_to_lark(rx, "");
365        let mut r = Self::from_lark(format!("start: /{rx}/"));
366        r.name = Some("regex".to_string());
367        r
368    }
369}