wordchipper_cli_util/lexers/
lexer_inventory.rs1use wordchipper::{
2 pretrained::openai::{
3 OA_CL100K_BASE_PATTERN,
4 OA_GPT2_PATTERN,
5 OA_O200K_BASE_PATTERN,
6 },
7 spanners::span_lexers::accelerators::get_regex_accelerator,
8};
9
/// Describes one named lexer pattern and the names it is registered under.
///
/// All fields are public, so a value can be built with a struct literal;
/// [`LexerDescription::new`] is the constructor that also fills in
/// `accelerated`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerDescription {
    /// Schema (namespace) the lexer belongs to, e.g. `"openai"`.
    pub schema: String,
    /// Names the lexer answers to; the first entry is the primary name.
    pub aliases: Vec<String>,
    /// Human-readable summary of the lexer.
    pub description: String,
    /// The pattern text of the lexer.
    pub pattern: String,
    /// Whether an accelerator was found for `pattern` at construction time.
    pub accelerated: bool,
}
19
20impl LexerDescription {
21 pub fn new(
23 schema: &str,
24 aliases: &[&str],
25 description: &str,
26 pattern: &str,
27 ) -> Self {
28 let accelerated = get_regex_accelerator(pattern).is_some();
29 assert_ne!(aliases.len(), 0);
30 Self {
31 schema: schema.to_string(),
32 aliases: aliases.iter().map(|a| a.to_string()).collect(),
33 description: description.to_string(),
34 pattern: pattern.to_string(),
35 accelerated,
36 }
37 }
38
39 pub fn name(&self) -> &str {
40 &self.aliases[0]
41 }
42
43 pub fn id(&self) -> String {
44 format!("{}::{}", self.schema, self.name())
45 }
46}
47
48#[derive(Debug, Clone, PartialEq)]
50pub struct LexerInventory {
51 pub lexers: Vec<LexerDescription>,
52}
53
54impl LexerInventory {
55 pub fn build() -> Self {
57 let mut lexers = vec![
58 LexerDescription::new(
59 "openai",
60 &["gpt2", "r50k", "p50k"],
61 "OpenAI's gpt2/r50k/p50k model pattern.",
62 OA_GPT2_PATTERN.as_str(),
63 ),
64 LexerDescription::new(
65 "openai",
66 &["cl100k", "cl100k_base", "cl100k_edit"],
67 "OpenAI's cl100k model pattern.",
68 OA_CL100K_BASE_PATTERN.as_str(),
69 ),
70 LexerDescription::new(
71 "openai",
72 &["o200k", "o200k_base", "o200k_harmony"],
73 "OpenAI's o200k model pattern.",
74 OA_O200K_BASE_PATTERN.as_str(),
75 ),
76 ];
77 lexers.sort_by_key(|a| a.id());
78 Self { lexers }
79 }
80
81 pub fn find_model(
83 &self,
84 query: &str,
85 ) -> Option<&LexerDescription> {
86 if query.contains(":") {
87 let (schema, name) = query.split_once(":")?;
88 for lexer in &self.lexers {
89 if lexer.schema == schema && lexer.aliases.iter().any(|a| a == name) {
90 return Some(lexer);
91 }
92 }
93 }
94 self.lexers
95 .iter()
96 .find(|&lexer| lexer.aliases.iter().any(|a| a == query))
97 .map(|v| v as _)
98 }
99}