1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/// Repo ID of the pre-quantized Qwen 2.5 1.5B Instruct model (GGUF Q4_K_M) — the lightest
/// mobile option (~941 MB).
///
/// Fits comfortably on both iOS (iPhone 16e, 8 GB RAM) and memory-constrained Android devices.
/// The 3B variant (~1.93 GB) caused OOM on iPhone 16e because iOS gives apps only ~2-3 GB;
/// at ~941 MB the 1.5B variant leaves comfortable headroom for the KV cache, activations,
/// and the app itself.
///
/// bartowski's GGUF embeds the full tokenizer and the Qwen2.5 chat template, so on iOS/macOS no
/// separate `tok_model_id` download is needed. On Android an explicit `tok_model_id` is
/// required; see [`QWEN25_1_5B_TOK_MODEL_ID`].
pub const BARTOWSKI_QWEN25_1_5B_INSTRUCT_GGUF: &str = "bartowski/Qwen2.5-1.5B-Instruct-GGUF";
/// The specific GGUF filename to download from the bartowski 1.5B repo
/// ([`BARTOWSKI_QWEN25_1_5B_INSTRUCT_GGUF`]).
pub const QWEN25_1_5B_GGUF_FILE: &str = "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf";
/// Base model repo used for the HF tokenizer on Android.
///
/// On Android, GGUF loading requires an explicit tokenizer source. On iOS/macOS the tokenizer
/// embedded in the GGUF file is used instead, which avoids an extra network download.
pub const QWEN25_1_5B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-1.5B-Instruct";
/// Repo ID of the pre-quantized Qwen 2.5 Coder 1.5B Instruct model (GGUF Q4_K_M) — the
/// dedicated coding model (~941 MB).
///
/// Uses the `qwen2` GGUF architecture, identical to Qwen2.5-1.5B-Instruct, so it loads through
/// the existing `quantized_qwen.rs` path in mistral.rs without any code changes.
///
/// Strongly preferred over the general-purpose 1.5B for coding tasks: it was trained on 5.5T
/// tokens of code and math data, with fill-in-the-middle (FIM) and repo-level code
/// understanding. The memory footprint is the same as the general 1.5B (~941 MB), but code
/// quality is dramatically better.
///
/// bartowski's GGUF embeds the full tokenizer and chat template, so on iOS/macOS no separate
/// `tok_model_id` download is needed. On Android an explicit `tok_model_id` is required; see
/// [`QWEN25_CODER_1_5B_TOK_MODEL_ID`].
pub const BARTOWSKI_QWEN25_CODER_1_5B_INSTRUCT_GGUF: &str =
"bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF";
/// The specific GGUF filename to download from the bartowski Coder 1.5B repo
/// ([`BARTOWSKI_QWEN25_CODER_1_5B_INSTRUCT_GGUF`]).
pub const QWEN25_CODER_1_5B_GGUF_FILE: &str = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf";
/// Base model repo used for the HF tokenizer on Android, where the Coder 1.5B GGUF needs an
/// explicit `tok_model_id`.
pub const QWEN25_CODER_1_5B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-Coder-1.5B-Instruct";
/// HuggingFace repo for the full-precision Qwen 2.5 Coder 7B Instruct model.
///
/// Used by ISQ (in-situ quantization) pipelines (`TextModelBuilder`), which download the
/// safetensors weights directly and quantize them on the device. Unlike the bartowski GGUF
/// variants, this repo ships the original BF16 weights; mistral.rs handles quantization to
/// Q4K or Q8_0 at load time via the `--isq` flag.
///
/// Memory: requires ~8 GB RAM during the load phase for 4-bit, ~12 GB for 8-bit.
/// Metal-accelerated on macOS; a CPU fallback is available but very slow.
pub const QWEN25_CODER_7B_INSTRUCT: &str = "Qwen/Qwen2.5-Coder-7B-Instruct";
/// Repo ID of the pre-quantized Qwen 2.5 Coder 3B Instruct model (GGUF Q4_K_M) — best coding
/// quality on macOS (~1.93 GB).
///
/// Same `qwen2` architecture as the 3B general model; ideal for macOS desktops where the extra
/// quality headroom over the 1.5B is worthwhile. Not recommended for iOS (OOM risk).
pub const BARTOWSKI_QWEN25_CODER_3B_INSTRUCT_GGUF: &str =
"bartowski/Qwen2.5-Coder-3B-Instruct-GGUF";
/// The specific GGUF filename to download from the bartowski Coder 3B repo
/// ([`BARTOWSKI_QWEN25_CODER_3B_INSTRUCT_GGUF`]).
pub const QWEN25_CODER_3B_GGUF_FILE: &str = "Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf";
/// Base model repo used for the HF tokenizer on Android when loading the Coder 3B GGUF.
pub const QWEN25_CODER_3B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-Coder-3B-Instruct";
/// Repo ID of the pre-quantized Qwen 2.5 3B Instruct model (GGUF Q4_K_M) — the balanced
/// option (~1.93 GB).
///
/// Ideal for macOS desktops and Android devices with sufficient RAM.
/// Not recommended as the default on iOS: the 3B variant caused OOM on iPhone 16e (8 GB RAM)
/// because iOS gives apps only ~2-3 GB; use the 1.5B variant on iOS instead.
///
/// No in-situ quantization is needed; the model loads directly at its quantized size.
pub const BARTOWSKI_QWEN25_3B_INSTRUCT_GGUF: &str = "bartowski/Qwen2.5-3B-Instruct-GGUF";
/// The specific GGUF filename to download from the bartowski 3B repo
/// ([`BARTOWSKI_QWEN25_3B_INSTRUCT_GGUF`]).
pub const QWEN25_3B_GGUF_FILE: &str = "Qwen2.5-3B-Instruct-Q4_K_M.gguf";
/// Base model repo used for the HF tokenizer (`tokenizer.json` + `tokenizer_config.json`).
pub const QWEN25_3B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-3B-Instruct";
/// Repo ID of the pre-quantized Qwen 3 4B model (GGUF Q4_K_M) — full OpenAI-compatible tool
/// calling (~2.7 GB).
///
/// Qwen 3 uses an extended thinking mode (`<think>…</think>`) that significantly improves
/// reasoning and tool-use accuracy. Load with `max_tokens ≥ 4096` to avoid empty replies caused
/// by the model exhausting its token budget on thinking before producing a response.
///
/// Recommended model for siGit Code (coding agent with tool calling on macOS/Linux/Windows).
// NOTE(review): this summary previously said "Qwen 3 4B Instruct", but neither the repo ID nor
// the GGUF filename below carries an `Instruct` suffix — confirm which wording is intended.
// NOTE(review): unlike the Qwen 2.5 entries, no `*_TOK_MODEL_ID` constant for this model is
// visible in this section — confirm whether one is needed.
pub const BARTOWSKI_QWEN3_4B_GGUF: &str = "bartowski/Qwen_Qwen3-4B-GGUF";
/// The specific GGUF filename to download from the bartowski Qwen 3 4B repo
/// ([`BARTOWSKI_QWEN3_4B_GGUF`]).
pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf";
/// All model IDs that the Onde inference engine supports.
///
/// Used by `list_local_hf_models` to filter the HuggingFace cache so that only models which
/// can actually be used for generation are shown.
// NOTE(review): the declaration below is not valid Rust as written — the element type after
// `&` and the initializer after `= &` are missing (presumably a slice type and a slice literal
// of the model-ID constants; TODO confirm and restore from the original source).
pub const SUPPORTED_MODELS: & = &;
/// Rich metadata for a supported model, used by the frontend to display
/// unavailable models that can be downloaded.
///
/// Complete list of supported models with display metadata.
///
/// When adding a new model, add its constant ID to [`SUPPORTED_MODELS`] **and**
/// a corresponding entry here so the frontend can show it in the model list UI.
// NOTE(review): the first paragraph above describes the metadata of a single model and reads
// like the doc comment of a per-model metadata type whose definition is not visible here —
// confirm that no type definition was lost between it and this constant.
// NOTE(review): the declaration below is not valid Rust as written — the element type after
// `&` and the initializer after `= &` are missing; restore them from the original source.
pub const SUPPORTED_MODEL_INFO: & = &;