Skip to main content

wordchipper_cli_util/lexers/
lexer_inventory.rs

1use wordchipper::{
2    pretrained::openai::{
3        OA_CL100K_BASE_PATTERN,
4        OA_GPT2_PATTERN,
5        OA_O200K_BASE_PATTERN,
6    },
7    spanners::span_lexers::accelerators::get_regex_accelerator,
8};
9
/// Description of a lexer.
///
/// A lexer is identified by a schema plus one or more aliases; the first
/// alias is the primary name (see `LexerDescription::name`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerDescription {
    /// Schema the lexer is registered under; the first component of `id()`.
    pub schema: String,
    /// Names the lexer can be looked up by. The first entry is the primary
    /// name; `LexerDescription::new` rejects an empty list.
    pub aliases: Vec<String>,
    /// Human-readable description of the lexer.
    pub description: String,
    /// The lexer's pattern source string.
    pub pattern: String,
    /// Whether `get_regex_accelerator` returned an accelerator for `pattern`
    /// when the description was constructed via `LexerDescription::new`.
    pub accelerated: bool,
}
19
20impl LexerDescription {
21    /// Create a new lexer description.
22    pub fn new(
23        schema: &str,
24        aliases: &[&str],
25        description: &str,
26        pattern: &str,
27    ) -> Self {
28        let accelerated = get_regex_accelerator(pattern).is_some();
29        assert_ne!(aliases.len(), 0);
30        Self {
31            schema: schema.to_string(),
32            aliases: aliases.iter().map(|a| a.to_string()).collect(),
33            description: description.to_string(),
34            pattern: pattern.to_string(),
35            accelerated,
36        }
37    }
38
39    pub fn name(&self) -> &str {
40        &self.aliases[0]
41    }
42
43    pub fn id(&self) -> String {
44        format!("{}::{}", self.schema, self.name())
45    }
46}
47
/// Inventory of lexers.
///
/// Holds the list of known [`LexerDescription`] entries; `LexerInventory::build`
/// populates it and `LexerInventory::find_model` searches it.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerInventory {
    /// The lexer descriptions. `build()` leaves them sorted by
    /// `LexerDescription::id`.
    pub lexers: Vec<LexerDescription>,
}
53
54impl LexerInventory {
55    /// Build the lexer inventory.
56    pub fn build() -> Self {
57        let mut lexers = vec![
58            LexerDescription::new(
59                "openai",
60                &["gpt2", "r50k", "p50k"],
61                "OpenAI's gpt2/r50k/p50k model pattern.",
62                OA_GPT2_PATTERN.as_str(),
63            ),
64            LexerDescription::new(
65                "openai",
66                &["cl100k", "cl100k_base", "cl100k_edit"],
67                "OpenAI's cl100k model pattern.",
68                OA_CL100K_BASE_PATTERN.as_str(),
69            ),
70            LexerDescription::new(
71                "openai",
72                &["o200k", "o200k_base", "o200k_harmony"],
73                "OpenAI's o200k model pattern.",
74                OA_O200K_BASE_PATTERN.as_str(),
75            ),
76        ];
77        lexers.sort_by_key(|a| a.id());
78        Self { lexers }
79    }
80
81    /// Find a lexer by name.
82    pub fn find_model(
83        &self,
84        query: &str,
85    ) -> Option<&LexerDescription> {
86        if query.contains(":") {
87            let (schema, name) = query.split_once(":")?;
88            for lexer in &self.lexers {
89                if lexer.schema == schema && lexer.aliases.iter().any(|a| a == name) {
90                    return Some(lexer);
91                }
92            }
93        }
94        self.lexers
95            .iter()
96            .find(|&lexer| lexer.aliases.iter().any(|a| a == query))
97            .map(|v| v as _)
98    }
99}