dynamo_llm/model_card/
model.rs

1// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16//! # Model Deployment Card
17//!
18//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
19//! component that needs to interact with the model or its dependent artifacts.
20//!
21//! The ModelDeploymentCard contains LLM model deployment configuration information:
22//! - Display name and service name for the model
23//! - Model information (ModelInfoType)
24//! - Tokenizer configuration (TokenizerKind)
25//! - Prompt formatter settings (PromptFormatterArtifact)
26//! - Various metadata like revision, publish time, etc.
27
28use std::fmt;
29use std::fs::File;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Duration;
33
34use anyhow::{Context, Result};
35use derive_builder::Builder;
36use dynamo_runtime::slug::Slug;
37use dynamo_runtime::transports::nats;
38use either::Either;
39use serde::{Deserialize, Serialize};
40use tokenizers::Tokenizer as HfTokenizer;
41use url::Url;
42
43use crate::gguf::{Content, ContentConfig};
44use crate::key_value_store::Versioned;
45use crate::protocols::TokenIdType;
46
/// Bucket under which model deployment cards are published
/// (presumably the key-value store bucket — see `key_value_store` usage; TODO confirm).
pub const BUCKET_NAME: &str = "mdc";

/// Delete model deployment cards that haven't been re-published after this long.
/// Cleans up if the worker stopped.
pub const BUCKET_TTL: Duration = Duration::from_secs(5 * 60);

/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
55
/// Source of the model's structural configuration.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
    /// Local path — or nats:// URL after `move_to_nats` — of a HuggingFace `config.json`.
    HfConfigJson(String),
    /// Path to a GGUF file; the config is read from its metadata (see `HFConfig::from_gguf`).
    GGUF(PathBuf),
}
62
/// Source of the model's tokenizer.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
    /// Local path — or nats:// URL after `move_to_nats` — of a HuggingFace `tokenizer.json`.
    HfTokenizerJson(String),
    /// In-memory tokenizer converted from GGUF metadata (boxed to keep the enum small).
    GGUF(Box<HfTokenizer>),
}
69
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact {
    /// Local path — or nats:: URL after `move_to_nats` — of a HuggingFace `tokenizer_config.json`.
    HfTokenizerConfigJson(String),
    /// Path to a GGUF file; the template is presumably read from its metadata (TODO confirm).
    GGUF(PathBuf),
}
88
/// Extra context made available when rendering the prompt template.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum PromptContextMixin {
    /// Support OAI Chat Messages and Tools
    OaiChat,

    /// Enables templates with `{{datetime}}` to be rendered with the current date and time.
    Llama3DateTime,
}
98
/// LLM model deployment configuration shared between components.
/// See the module-level docs for an overview of what the card carries.
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
    /// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
    pub display_name: String,

    /// Identifier to expect in OpenAI compatible HTTP request, e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct"
    /// This will get slugified for use in NATS.
    pub service_name: String,

    /// Model information
    pub model_info: Option<ModelInfoType>,

    /// Tokenizer configuration
    pub tokenizer: Option<TokenizerKind>,

    /// Prompt Formatter configuration
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_formatter: Option<PromptFormatterArtifact>,

    /// Prompt Formatter Config
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_context: Option<Vec<PromptContextMixin>>,

    /// When this card was last advertised by a worker. None if not yet published.
    pub last_published: Option<chrono::DateTime<chrono::Utc>>,

    /// Incrementing count of how many times we published this card.
    /// NOTE(review): `skip_serializing` means readers of a round-tripped card
    /// always see 0 here — confirm that is intended by the versioning scheme.
    #[serde(default, skip_serializing)]
    pub revision: u64,

    /// Does this model expect preprocessing (tokenization, etc) to be already done?
    /// If this is true they get a BackendInput JSON. If this is false they get
    /// a ChatCompletionRequest JSON.
    #[serde(default)]
    pub requires_preprocessing: bool,
}
135
136impl ModelDeploymentCard {
137    pub fn builder() -> ModelDeploymentCardBuilder {
138        ModelDeploymentCardBuilder::default()
139    }
140
141    /// Create a ModelDeploymentCard where only the name is filled in.
142    ///
143    /// Single-process setups don't need an MDC to communicate model details, but it
144    /// simplifies the code to assume we always have one. This is how you get one in those
145    /// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
146    pub fn with_name_only(name: &str) -> ModelDeploymentCard {
147        ModelDeploymentCard {
148            display_name: name.to_string(),
149            service_name: Slug::from_string(name).to_string(),
150            ..Default::default()
151        }
152    }
153
154    /// A URL and NATS friendly and very likely unique ID for this model.
155    /// Mostly human readable. a-z, 0-9, _ and - only.
156    /// Pass the service_name.
157    pub fn service_name_slug(s: &str) -> Slug {
158        Slug::from_string(s)
159    }
160
161    /// How often we should check if a model deployment card expired because it's workers are gone
162    pub fn expiry_check_period() -> Duration {
163        match CARD_MAX_AGE.to_std() {
164            Ok(duration) => duration / 3,
165            Err(_) => {
166                // Only happens if CARD_MAX_AGE is negative, which it isn't
167                unreachable!("Cannot run card expiry watcher, invalid CARD_MAX_AGE");
168            }
169        }
170    }
171
172    /// Load a model deployment card from a JSON file
173    pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
174        let mut card: ModelDeploymentCard = serde_json::from_str(&std::fs::read_to_string(file)?)?;
175        card.requires_preprocessing = false;
176        Ok(card)
177    }
178
179    /// Load a model deployment card from a JSON string
180    pub fn load_from_json_str(json: &str) -> Result<Self, anyhow::Error> {
181        Ok(serde_json::from_str(json)?)
182    }
183
184    //
185    // Methods
186    //
187
188    /// Save the model deployment card to a JSON file
189    pub fn save_to_json_file(&self, file: &str) -> Result<(), anyhow::Error> {
190        std::fs::write(file, self.to_json()?)?;
191        Ok(())
192    }
193
194    pub fn set_service_name(&mut self, service_name: &str) {
195        self.service_name = service_name.to_string();
196    }
197
198    pub fn slug(&self) -> Slug {
199        ModelDeploymentCard::service_name_slug(&self.service_name)
200    }
201
202    /// Serialize the model deployment card to a JSON string
203    pub fn to_json(&self) -> Result<String, anyhow::Error> {
204        Ok(serde_json::to_string(self)?)
205    }
206
207    pub fn mdcsum(&self) -> String {
208        let json = self.to_json().unwrap();
209        format!("{}", blake3::hash(json.as_bytes()))
210    }
211
212    /// Was this card last published a long time ago, suggesting the worker is gone?
213    pub fn is_expired(&self) -> bool {
214        if let Some(last_published) = self.last_published.as_ref() {
215            chrono::Utc::now() - last_published > CARD_MAX_AGE
216        } else {
217            false
218        }
219    }
220
221    pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
222        match &self.tokenizer {
223            Some(TokenizerKind::HfTokenizerJson(file)) => {
224                HfTokenizer::from_file(file).map_err(anyhow::Error::msg)
225            }
226            Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
227            None => {
228                anyhow::bail!("Blank ModelDeploymentCard does not have a tokenizer");
229            }
230        }
231    }
232
233    /// Move the files this MDC uses into the NATS object store.
234    /// Updates the URI's to point to NATS.
235    pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
236        let nats_addr = nats_client.addr();
237        let bucket_name = self.slug();
238        tracing::debug!(
239            nats_addr,
240            %bucket_name,
241            "Uploading model deployment card to NATS"
242        );
243
244        if let Some(ModelInfoType::HfConfigJson(ref src_file)) = self.model_info {
245            if !nats::is_nats_url(src_file) {
246                let target = format!("nats://{nats_addr}/{bucket_name}/config.json");
247                nats_client
248                    .object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
249                    .await?;
250                self.model_info = Some(ModelInfoType::HfConfigJson(target));
251            }
252        }
253
254        if let Some(PromptFormatterArtifact::HfTokenizerConfigJson(ref src_file)) =
255            self.prompt_formatter
256        {
257            if !nats::is_nats_url(src_file) {
258                let target = format!("nats://{nats_addr}/{bucket_name}/tokenizer_config.json");
259                nats_client
260                    .object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
261                    .await?;
262                self.prompt_formatter =
263                    Some(PromptFormatterArtifact::HfTokenizerConfigJson(target));
264            }
265        }
266
267        if let Some(TokenizerKind::HfTokenizerJson(ref src_file)) = self.tokenizer {
268            if !nats::is_nats_url(src_file) {
269                let target = format!("nats://{nats_addr}/{bucket_name}/tokenizer.json");
270                nats_client
271                    .object_store_upload(&PathBuf::from(src_file), Url::parse(&target)?)
272                    .await?;
273                self.tokenizer = Some(TokenizerKind::HfTokenizerJson(target));
274            }
275        }
276
277        Ok(())
278    }
279
280    /// Delete this card from the key-value store and it's URLs from the object store
281    pub async fn delete_from_nats(&mut self, nats_client: nats::Client) -> Result<()> {
282        let nats_addr = nats_client.addr();
283        let bucket_name = self.slug();
284        tracing::trace!(
285            nats_addr,
286            %bucket_name,
287            "Delete model deployment card from NATS"
288        );
289        nats_client
290            .object_store_delete_bucket(bucket_name.as_ref())
291            .await
292    }
293}
294
295impl Versioned for ModelDeploymentCard {
296    fn revision(&self) -> u64 {
297        self.revision
298    }
299
300    fn set_revision(&mut self, revision: u64) {
301        self.last_published = Some(chrono::Utc::now());
302        self.revision = revision;
303    }
304}
305
impl fmt::Display for ModelDeploymentCard {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Display as the slug so log output matches the NATS bucket/object names.
        write!(f, "{}", self.slug())
    }
}
/// Read-only view of a model's structural configuration
/// (the fields found in e.g. a HuggingFace `config.json`).
pub trait ModelInfo: Send + Sync {
    /// Model type
    fn model_type(&self) -> String;

    /// Token ID for the beginning of sequence
    fn bos_token_id(&self) -> TokenIdType;

    /// Token ID for the end of sequence
    fn eos_token_ids(&self) -> Vec<TokenIdType>;

    /// Maximum position embeddings / max sequence length
    fn max_position_embeddings(&self) -> usize;

    /// Vocabulary size
    fn vocab_size(&self) -> usize;
}
327
328impl ModelInfoType {
329    pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
330        match self {
331            Self::HfConfigJson(info) => HFConfig::from_json_file(info).await,
332            Self::GGUF(path) => HFConfig::from_gguf(path),
333        }
334    }
335}
336
/// The subset of a HuggingFace `config.json` this crate needs.
/// Also constructed from GGUF metadata — see `from_gguf`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfig {
    // Beginning-of-sequence token ID.
    bos_token_id: TokenIdType,

    // HF configs use both a single ID and a list here, hence untagged (de)serialization.
    #[serde(with = "either::serde_untagged")]
    eos_token_id: Either<TokenIdType, Vec<TokenIdType>>,

    /// denotes the mixin to the flattened data model which can be present
    /// in the config.json file
    architectures: Vec<String>,

    /// general model type
    model_type: String,

    /// max sequence length
    max_position_embeddings: usize,

    /// number of layers in the model
    num_hidden_layers: usize,

    /// number of attention heads in the model
    num_attention_heads: usize,

    /// Vocabulary size
    vocab_size: usize,
}
363
364impl HFConfig {
365    async fn from_json_file(file: &String) -> Result<Arc<dyn ModelInfo>> {
366        let contents = std::fs::read_to_string(file)?;
367        let config: Self = serde_json::from_str(&contents)?;
368        Ok(Arc::new(config))
369    }
370    fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
371        let content = load_gguf(gguf_file)?;
372        let model_config_metadata: ContentConfig = (&content).into();
373        let num_hidden_layers =
374            content.get_metadata()[&format!("{}.block_count", content.arch())].to_u32()? as usize;
375
376        let bos_token_id = content.get_metadata()["tokenizer.ggml.bos_token_id"].to_u32()?;
377        let eos_token_id = content.get_metadata()["tokenizer.ggml.eos_token_id"].to_u32()?;
378
379        // to_vec returns a Vec that's already there, so it's cheap
380        let vocab_size = content.get_metadata()["tokenizer.ggml.tokens"]
381            .to_vec()?
382            .len();
383
384        let arch = content.arch().to_string();
385        Ok(Arc::new(HFConfig {
386            bos_token_id,
387            eos_token_id: Either::Left(eos_token_id),
388            architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
389            // "general.architecture"
390            model_type: arch,
391            // "llama.context_length"
392            max_position_embeddings: model_config_metadata.max_seq_len(),
393            // "llama.block_count"
394            num_hidden_layers,
395            // "llama.attention.head_count"
396            num_attention_heads: model_config_metadata.num_attn_heads(),
397            // "tokenizer.ggml.tokens".len()
398            vocab_size,
399        }))
400    }
401}
402
403impl ModelInfo for HFConfig {
404    fn model_type(&self) -> String {
405        self.model_type.clone()
406    }
407
408    fn bos_token_id(&self) -> TokenIdType {
409        self.bos_token_id
410    }
411
412    fn eos_token_ids(&self) -> Vec<TokenIdType> {
413        match &self.eos_token_id {
414            Either::Left(eos_token_id) => vec![*eos_token_id],
415            Either::Right(eos_token_ids) => eos_token_ids.clone(),
416        }
417    }
418
419    fn max_position_embeddings(&self) -> usize {
420        self.max_position_embeddings
421    }
422
423    fn vocab_size(&self) -> usize {
424        self.vocab_size
425    }
426}
427
428impl TokenizerKind {
429    pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
430        let content = load_gguf(gguf_file)?;
431        let out = crate::gguf::convert_gguf_to_hf_tokenizer(&content)
432            .with_context(|| gguf_file.display().to_string())?;
433        Ok(TokenizerKind::GGUF(Box::new(out.tokenizer)))
434    }
435}
436
437fn load_gguf(gguf_file: &Path) -> anyhow::Result<Content> {
438    let filename = gguf_file.display().to_string();
439    let mut f = File::open(gguf_file).with_context(|| filename.clone())?;
440    // vec because GGUF can be split into multiple files (shards)
441    let mut readers = vec![&mut f];
442    crate::gguf::Content::from_readers(&mut readers).with_context(|| filename.clone())
443}
444
/// Uppercase the first character and lowercase the rest, e.g. "llama" -> "Llama",
/// "GPT" -> "Gpt". Unicode-aware: case mappings may expand to multiple chars.
fn capitalize(s: &str) -> String {
    let mut chars = s.chars();
    match chars.next() {
        None => String::new(),
        Some(first) => {
            // Preallocate roughly the right size; avoids the per-character
            // String allocations of the map(to_string).collect() approach.
            let mut out = String::with_capacity(s.len());
            out.extend(first.to_uppercase());
            out.extend(chars.flat_map(char::to_lowercase));
            out
        }
    }
}