use std::fmt::{Debug, Display};
use anyhow::{bail, Result};
use derivre::RegexAst;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use crate::{
earley::{lexerspec::LexerSpec, Grammar},
regex_to_lark,
};
pub use crate::earley::ValidationResult;
/// Top-level grammar description: a list of grammars plus an optional token limit.
///
/// The `from_*` constructors on this type each wrap a single
/// [`GrammarWithLexer`] and leave `max_tokens` unset.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TopLevelGrammar {
/// Grammars making up this description.
pub grammars: Vec<GrammarWithLexer>,
/// Optional token limit; `None` when not specified.
/// NOTE(review): presumably caps the number of generated tokens -- confirm against the consumer.
pub max_tokens: Option<usize>,
}
/// Input from which a grammar is initialized: either the serializable
/// [`TopLevelGrammar`] form, or a crate-internal [`Grammar`] paired with a
/// [`LexerSpec`].
// The size lint is silenced here rather than boxing one of the payloads.
#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum GrammarInit {
/// A grammar in its serializable form.
Serialized(TopLevelGrammar),
/// A crate-internal grammar together with a lexer specification.
Internal(Grammar, LexerSpec),
}
/// Presumably the default for a `contextual` option.
/// NOTE(review): the option itself is not visible in this part of the file -- confirm where this constant is consumed.
pub const DEFAULT_CONTEXTUAL: bool = true;
/// Boolean option flags.
///
/// Every field carries `#[serde(default)]`, so a flag missing from the
/// serialized input deserializes as `false`. Two option sets are combined
/// with `apply`, which only ever switches flags on.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LLGuidanceOptions {
/// NOTE(review): name suggests this disables forced tokens -- confirm against the consumer.
#[serde(default)]
pub no_forcing: bool,
/// NOTE(review): name suggests invalid UTF-8 byte sequences are tolerated when set -- confirm.
#[serde(default)]
pub allow_invalid_utf8: bool,
/// NOTE(review): name suggests a skip rule may match at the very start when set -- confirm.
#[serde(default)]
pub allow_initial_skip: bool,
}
impl LLGuidanceOptions {
    /// Merges `other` into `self`: every flag enabled in `other` becomes
    /// enabled in `self`.
    ///
    /// Flags are only ever switched on; a `false` in `other` leaves the
    /// corresponding flag in `self` untouched.
    pub fn apply(&mut self, other: &LLGuidanceOptions) {
        self.no_forcing |= other.no_forcing;
        self.allow_invalid_utf8 |= other.allow_invalid_utf8;
        self.allow_initial_skip |= other.allow_initial_skip;
    }
}
/// A single named grammar, given either as a JSON schema or as Lark source.
///
/// The `from_lark`, `from_json_schema` and `from_regex` constructors each set
/// `name` and exactly one of `json_schema` / `lark_grammar`. `Debug` is
/// implemented by hand and prints only which kind of grammar this is.
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct GrammarWithLexer {
/// Optional grammar name; the constructors set it to `"lark_grammar"`,
/// `"json_schema"` or `"regex"`.
pub name: Option<String>,
/// The grammar as a JSON schema value, when given in that form.
pub json_schema: Option<Value>,
/// The grammar as Lark source text, when given in that form.
pub lark_grammar: Option<String>,
}
impl Debug for GrammarWithLexer {
    /// Prints a short tag instead of the grammar contents: `lark` when
    /// `lark_grammar` is set, `json` in every other case.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let kind = match &self.lark_grammar {
            Some(_) => "lark",
            None => "json",
        };
        write!(f, "GrammarWithLexer [{kind}]")
    }
}
/// Optional per-node properties; every field is `None` in the `Default` value.
/// NOTE(review): how each property is consumed is not visible in this part of the file.
#[derive(Serialize, Deserialize, Default, Clone, PartialEq, Eq)]
pub struct NodeProps {
/// Optional token limit. NOTE(review): presumably scoped to this node -- confirm.
pub max_tokens: Option<usize>,
/// Optional name.
pub name: Option<String>,
/// Optional capture name. NOTE(review): presumably the key under which matched text is reported -- confirm.
pub capture_name: Option<String>,
}
/// Options built around two regular expressions, a body and a stop pattern.
/// NOTE(review): the exact semantics of each field are not visible in this
/// part of the file; the notes below are inferred from names only.
#[derive(Clone)]
pub struct GenOptions {
/// Regex for the body. NOTE(review): presumably constrains the generated text -- confirm.
pub body_rx: RegexAst,
/// Regex for the stop condition. NOTE(review): presumably ends generation when matched -- confirm.
pub stop_rx: RegexAst,
/// Optional capture name associated with the stop pattern.
pub stop_capture_name: Option<String>,
/// Optional laziness override; `None` leaves the choice to the consumer.
pub lazy: Option<bool>,
/// Optional suffix flag; `None` leaves the choice to the consumer.
pub is_suffix: Option<bool>,
/// Optional temperature. NOTE(review): presumably a sampling temperature -- confirm.
pub temperature: Option<f32>,
}
/// Options that refer to a grammar by its [`GrammarId`].
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct GenGrammarOptions {
/// Identifier of the referenced grammar.
pub grammar: GrammarId,
/// Optional temperature. NOTE(review): presumably a sampling temperature -- confirm.
pub temperature: Option<f32>,
}
/// Identifier of a grammar.
///
/// Because of `#[serde(untagged)]`, the `Name` variant is serialized as a
/// bare string with no variant tag. `Display` renders it as `@` followed by
/// the name.
#[derive(Serialize, Deserialize, Hash, PartialEq, Eq, Clone, Debug)]
#[serde(untagged)]
pub enum GrammarId {
/// A grammar referred to by name.
Name(String),
}
impl Display for GrammarId {
    /// Renders the identifier as `@` followed by the grammar name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `Name` is the only variant, so this pattern is irrefutable.
        let GrammarId::Name(name) = self;
        write!(f, "@{name}")
    }
}
/// Optional `substring_*` settings, each absent (`None`) unless supplied.
///
/// `#[serde(deny_unknown_fields)]` makes deserialization fail on any key
/// other than the three fields declared here.
/// NOTE(review): how the three forms differ is not visible in this part of
/// the file; the field notes below are inferred from names only.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(deny_unknown_fields)]
pub struct RegexExt {
/// A list of string chunks. NOTE(review): presumably matches substrings made of whole chunks -- confirm.
pub substring_chunks: Option<Vec<String>>,
/// A single string. NOTE(review): presumably matches substrings at word granularity -- confirm.
pub substring_words: Option<String>,
/// A single string. NOTE(review): presumably matches substrings at character granularity -- confirm.
pub substring_chars: Option<String>,
}
/// Reason a run stopped (or `NotStopped`).
///
/// `is_ok` splits the variants into two groups, noted on each variant below.
/// `Display` prints the same string the variant serializes to.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum StopReason {
/// Counted as OK by `is_ok`.
NotStopped,
/// Not OK per `is_ok`. NOTE(review): presumably an overall token limit was hit -- confirm.
MaxTokensTotal,
/// Not OK per `is_ok`. NOTE(review): presumably a parser-level token limit was hit -- confirm.
MaxTokensParser,
/// Counted as OK by `is_ok`.
NoExtension,
/// Counted as OK by `is_ok`.
NoExtensionBias,
/// Counted as OK by `is_ok`.
EndOfSentence,
/// Not OK per `is_ok`.
InternalError,
/// Not OK per `is_ok`. NOTE(review): presumably a lexer limit from `ParserLimits` was exceeded -- confirm.
LexerTooComplex,
/// Not OK per `is_ok`. NOTE(review): presumably a parser limit from `ParserLimits` was exceeded -- confirm.
ParserTooComplex,
}
impl Display for StopReason {
    /// Writes the variant exactly as serde serializes it, so the text shown
    /// to users and the serialized form cannot drift apart.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Both unwraps rely on the enum consisting solely of unit variants,
        // which serde represents as a plain JSON string.
        let serialized = serde_json::to_value(self).unwrap();
        let label = serialized.as_str().unwrap();
        f.write_str(label)
    }
}
impl StopReason {
    /// Returns `true` for the stop reasons that do not indicate a failure:
    /// `NotStopped`, `EndOfSentence`, `NoExtension` and `NoExtensionBias`.
    pub fn is_ok(&self) -> bool {
        // Written exhaustively so a newly added variant must be classified here.
        match self {
            StopReason::NotStopped
            | StopReason::EndOfSentence
            | StopReason::NoExtension
            | StopReason::NoExtensionBias => true,
            StopReason::MaxTokensTotal
            | StopReason::MaxTokensParser
            | StopReason::InternalError
            | StopReason::LexerTooComplex
            | StopReason::ParserTooComplex => false,
        }
    }
}
/// Numeric limits and switches for the parser and lexer.
///
/// `#[serde(default)]` on the struct means any field missing from the
/// serialized input takes its value from the `Default` impl.
/// `#[repr(C)]` gives the struct a C-compatible layout, so field order is
/// part of that layout.
/// NOTE(review): per-field meanings below are inferred from names; only the
/// default values are established by the `Default` impl.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(default)]
#[repr(C)]
pub struct ParserLimits {
/// Default: 2000. NOTE(review): presumably the maximum number of parser items in one row -- confirm.
pub max_items_in_row: usize,
/// Default: 1_000_000. NOTE(review): presumably the lexer work budget for the initial computation -- confirm.
pub initial_lexer_fuel: u64,
/// Default: 200_000. NOTE(review): presumably the lexer work budget per step -- confirm.
pub step_lexer_fuel: u64,
/// Default: 50_000. NOTE(review): presumably the parser item budget per step -- confirm.
pub step_max_items: usize,
/// Default: 250_000. NOTE(review): presumably the maximum number of lexer states -- confirm.
pub max_lexer_states: usize,
/// Default: 500_000. NOTE(review): presumably the maximum grammar size -- confirm units.
pub max_grammar_size: usize,
/// Default: `true`.
pub precompute_large_lexemes: bool,
/// Default: `true`. NOTE(review): presumably enables more detailed error messages -- confirm.
pub verbose_errors: bool,
}
impl Default for ParserLimits {
fn default() -> Self {
Self {
max_items_in_row: 2000,
initial_lexer_fuel: 1_000_000, step_lexer_fuel: 200_000, max_lexer_states: 250_000, max_grammar_size: 500_000, step_max_items: 50_000, precompute_large_lexemes: true,
verbose_errors: true,
}
}
}
impl TopLevelGrammar {
    /// Parses `s` as either a JSON-serialized `TopLevelGrammar` or Lark source.
    ///
    /// The first non-whitespace character decides: `{` selects JSON, anything
    /// else is treated as a Lark grammar.
    ///
    /// # Errors
    ///
    /// Fails with "Empty grammar" when `s` is empty or all whitespace, and
    /// propagates the `serde_json` error when the JSON form does not parse.
    pub fn from_lark_or_grammar_list(s: &str) -> Result<Self> {
        match s.chars().find(|ch| !ch.is_whitespace()) {
            None => bail!("Empty grammar"),
            Some('{') => Ok(serde_json::from_str(s)?),
            Some(_) => Ok(Self::from_lark(s.to_string())),
        }
    }

    /// Builds a single-grammar description from a regular expression.
    pub fn from_regex(rx: &str) -> Self {
        let grammar = GrammarWithLexer::from_regex(rx);
        Self::from_grammar(grammar)
    }

    /// Builds a single-grammar description from Lark source text.
    pub fn from_lark(lark_grammar: String) -> Self {
        let grammar = GrammarWithLexer::from_lark(lark_grammar);
        Self::from_grammar(grammar)
    }

    /// Builds a single-grammar description from a JSON schema value.
    pub fn from_json_schema(json_schema: Value) -> Self {
        let grammar = GrammarWithLexer::from_json_schema(json_schema);
        Self::from_grammar(grammar)
    }

    /// Wraps one grammar into a description with no token limit.
    pub fn from_grammar(grammar: GrammarWithLexer) -> Self {
        Self {
            max_tokens: None,
            grammars: vec![grammar],
        }
    }

    /// Builds a description from a `(tag, data)` pair.
    ///
    /// Recognized tags:
    /// - `regex`: `data` is a regular expression;
    /// - `json` / `json_schema`: `data` is a JSON schema in JSON text;
    /// - `json_object`: any JSON object; `data` is not read;
    /// - `lark`: `data` is Lark source;
    /// - `llguidance` / `guidance`: `data` is handled by
    ///   [`Self::from_lark_or_grammar_list`].
    ///
    /// # Errors
    ///
    /// Fails on an unrecognized tag, on schema text that is not valid JSON,
    /// and on any error from [`Self::from_lark_or_grammar_list`].
    pub fn from_tagged_str(tag: &str, data: &str) -> Result<Self> {
        let grammar = match tag {
            "regex" => Self::from_regex(data),
            "json" | "json_schema" => {
                let schema: Value = serde_json::from_str(data)?;
                Self::from_json_schema(schema)
            }
            "json_object" => Self::from_json_schema(json!({"type": "object"})),
            "lark" => Self::from_lark(data.to_string()),
            "llguidance" | "guidance" => Self::from_lark_or_grammar_list(data)?,
            _ => bail!("unknown constraint type: {tag}"),
        };
        Ok(grammar)
    }
}
impl GrammarWithLexer {
    /// Creates a grammar named `lark_grammar` from Lark source text.
    pub fn from_lark(lark_grammar: String) -> Self {
        Self {
            name: Some(String::from("lark_grammar")),
            lark_grammar: Some(lark_grammar),
            ..Default::default()
        }
    }

    /// Creates a grammar named `json_schema` from a JSON schema value.
    pub fn from_json_schema(json_schema: Value) -> Self {
        Self {
            name: Some(String::from("json_schema")),
            json_schema: Some(json_schema),
            ..Default::default()
        }
    }

    /// Creates a grammar named `regex` whose Lark source is a single
    /// `start` rule matching `rx`.
    ///
    /// `rx` is first passed through `regex_to_lark` before being placed
    /// between the `/` delimiters.
    pub fn from_regex(rx: &str) -> Self {
        let lark_rx = regex_to_lark(rx, "");
        Self {
            name: Some(String::from("regex")),
            ..Self::from_lark(format!("start: /{lark_rx}/"))
        }
    }
}