Skip to main content

codex_asr/
lib.rs

1use std::env;
2use std::fs;
3use std::path::{Path, PathBuf};
4use std::process::Command;
5use std::time::Duration;
6
7use base64::prelude::*;
8use reqwest::blocking::{multipart, Client};
9use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION, USER_AGENT};
10use serde::Deserialize;
11use serde_json::Value;
12use thiserror::Error;
13
/// Endpoint a [`CodexAsrClientBuilder`] targets unless another one is configured.
pub const DEFAULT_ENDPOINT: &str = "https://chatgpt.com/backend-api/transcribe";

/// Value of the `originator` request header and product name of the default user agent.
pub const DEFAULT_ORIGINATOR: &str = "Codex Desktop";

/// Version placed in the default user agent when no installed desktop version is detected.
const DEFAULT_DESKTOP_VERSION: &str = "26.429.30905";
17
18#[derive(Debug, Error)]
19pub enum CodexAsrError {
20    #[error("failed to read {path}: {source}")]
21    ReadFile {
22        path: PathBuf,
23        #[source]
24        source: std::io::Error,
25    },
26    #[error("failed to parse {path}: {source}")]
27    ParseAuth {
28        path: PathBuf,
29        #[source]
30        source: serde_json::Error,
31    },
32    #[error("Codex auth at {path} does not contain a ChatGPT access token")]
33    MissingAccessToken { path: PathBuf },
34    #[error("Codex auth mode is {mode}, not ChatGPT token auth")]
35    UnsupportedAuthMode { mode: String },
36    #[error("invalid bearer token")]
37    InvalidBearer,
38    #[error("invalid header value: {0}")]
39    InvalidHeader(#[from] reqwest::header::InvalidHeaderValue),
40    #[error("failed to build HTTP client: {0}")]
41    BuildClient(#[source] reqwest::Error),
42    #[error("failed to build multipart request: {0}")]
43    BuildMultipart(#[source] reqwest::Error),
44    #[error("transcribe request failed: {0}")]
45    Request(#[source] reqwest::Error),
46    #[error("transcribe request failed with HTTP {status}: {body}")]
47    Http { status: u16, body: String },
48    #[error("transcribe response did not contain text")]
49    MissingText,
50}
51
52pub type Result<T> = std::result::Result<T, CodexAsrError>;
53
54#[derive(Debug, Clone)]
55pub struct CodexAuth {
56    pub access_token: String,
57    pub account_id: Option<String>,
58    pub path: Option<PathBuf>,
59}
60
61impl CodexAuth {
62    pub fn from_bearer(token: impl AsRef<str>, account_id: Option<String>) -> Result<Self> {
63        let access_token =
64            strip_bearer_prefix(token.as_ref()).ok_or(CodexAsrError::InvalidBearer)?;
65        let account_id = account_id.or_else(|| account_id_from_access_token(&access_token));
66        Ok(Self {
67            access_token,
68            account_id,
69            path: None,
70        })
71    }
72
73    pub fn from_codex_home() -> Result<Self> {
74        Self::from_auth_file(default_auth_file())
75    }
76
77    pub fn from_auth_file(path: impl AsRef<Path>) -> Result<Self> {
78        let path = path.as_ref().to_path_buf();
79        let raw = fs::read_to_string(&path).map_err(|source| CodexAsrError::ReadFile {
80            path: path.clone(),
81            source,
82        })?;
83        let parsed: AuthFile =
84            serde_json::from_str(&raw).map_err(|source| CodexAsrError::ParseAuth {
85                path: path.clone(),
86                source,
87            })?;
88        let mode = parsed.auth_mode.or(parsed.auth_mode_camel);
89        if let Some(mode) = mode {
90            if mode != "chatgpt" && mode != "chatgpt_auth_tokens" {
91                return Err(CodexAsrError::UnsupportedAuthMode { mode });
92            }
93        }
94        let tokens = parsed.tokens;
95        let token = tokens
96            .as_ref()
97            .and_then(|tokens| tokens.access_token.clone())
98            .filter(|token| !token.trim().is_empty())
99            .ok_or_else(|| CodexAsrError::MissingAccessToken { path: path.clone() })?;
100        let account_id = tokens
101            .and_then(|tokens| tokens.account_id)
102            .or_else(|| account_id_from_access_token(&token));
103        Ok(Self {
104            access_token: token,
105            account_id,
106            path: Some(path),
107        })
108    }
109}
110
111#[derive(Debug, Clone)]
112pub struct CodexAsrClient {
113    endpoint: String,
114    auth: CodexAuth,
115    http: Client,
116    originator: String,
117    user_agent: String,
118}
119
120impl CodexAsrClient {
121    pub fn builder(auth: CodexAuth) -> CodexAsrClientBuilder {
122        CodexAsrClientBuilder::new(auth)
123    }
124
125    pub fn from_codex_home() -> Result<Self> {
126        Self::builder(CodexAuth::from_codex_home()?).build()
127    }
128
129    pub fn transcribe_file(
130        &self,
131        path: impl AsRef<Path>,
132        options: TranscribeOptions,
133    ) -> Result<Transcription> {
134        let path = path.as_ref();
135        let audio = fs::read(path).map_err(|source| CodexAsrError::ReadFile {
136            path: path.to_path_buf(),
137            source,
138        })?;
139        let content_type = options
140            .content_type
141            .unwrap_or_else(|| infer_content_type(path).to_string());
142        let filename = options
143            .filename
144            .unwrap_or_else(|| upload_filename(path, &content_type));
145        self.transcribe_bytes(audio, &filename, &content_type, options.language)
146    }
147
148    pub fn transcribe_bytes(
149        &self,
150        audio: Vec<u8>,
151        filename: &str,
152        content_type: &str,
153        language: Option<String>,
154    ) -> Result<Transcription> {
155        let part = multipart::Part::bytes(audio)
156            .file_name(filename.replace('"', ""))
157            .mime_str(content_type)
158            .map_err(CodexAsrError::BuildMultipart)?;
159        let mut form = multipart::Form::new().part("file", part);
160        if let Some(language) = language {
161            form = form.text("language", language);
162        }
163
164        let response = self
165            .http
166            .post(&self.endpoint)
167            .headers(self.auth_headers()?)
168            .multipart(form)
169            .send()
170            .map_err(CodexAsrError::Request)?;
171        let status = response.status();
172        let body = response.text().map_err(CodexAsrError::Request)?;
173        if !status.is_success() {
174            return Err(CodexAsrError::Http {
175                status: status.as_u16(),
176                body: clip_response_body(&body),
177            });
178        }
179        let parsed: TranscribeResponse =
180            serde_json::from_str(&body).map_err(|source| CodexAsrError::ParseAuth {
181                path: PathBuf::from("<transcribe response>"),
182                source,
183            })?;
184        let text = parsed.text.ok_or(CodexAsrError::MissingText)?;
185        Ok(Transcription { text })
186    }
187
188    fn auth_headers(&self) -> Result<HeaderMap> {
189        let mut headers = HeaderMap::new();
190        headers.insert(
191            AUTHORIZATION,
192            HeaderValue::from_str(&format!("Bearer {}", self.auth.access_token))?,
193        );
194        headers.insert("originator", HeaderValue::from_str(&self.originator)?);
195        headers.insert(USER_AGENT, HeaderValue::from_str(&self.user_agent)?);
196        if let Some(account_id) = &self.auth.account_id {
197            headers.insert("ChatGPT-Account-Id", HeaderValue::from_str(account_id)?);
198        }
199        Ok(headers)
200    }
201}
202
203#[derive(Debug, Clone)]
204pub struct CodexAsrClientBuilder {
205    auth: CodexAuth,
206    endpoint: String,
207    proxy: Option<String>,
208    timeout: Option<Duration>,
209    connect_timeout: Option<Duration>,
210    originator: String,
211    user_agent: String,
212}
213
214impl CodexAsrClientBuilder {
215    pub fn new(auth: CodexAuth) -> Self {
216        let version =
217            detect_codex_desktop_version().unwrap_or_else(|| DEFAULT_DESKTOP_VERSION.to_string());
218        Self {
219            auth,
220            endpoint: DEFAULT_ENDPOINT.to_string(),
221            proxy: resolve_proxy(None),
222            timeout: None,
223            connect_timeout: None,
224            originator: DEFAULT_ORIGINATOR.to_string(),
225            user_agent: format!(
226                "{DEFAULT_ORIGINATOR}/{version} ({}; {})",
227                env::consts::OS,
228                env::consts::ARCH
229            ),
230        }
231    }
232
233    pub fn endpoint(mut self, endpoint: impl Into<String>) -> Self {
234        self.endpoint = endpoint.into();
235        self
236    }
237
238    pub fn proxy(mut self, proxy: Option<String>) -> Self {
239        self.proxy = proxy;
240        self
241    }
242
243    pub fn timeout(mut self, timeout: Option<Duration>) -> Self {
244        self.timeout = timeout;
245        self
246    }
247
248    pub fn connect_timeout(mut self, timeout: Option<Duration>) -> Self {
249        self.connect_timeout = timeout;
250        self
251    }
252
253    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
254        self.user_agent = user_agent.into();
255        self
256    }
257
258    pub fn build(self) -> Result<CodexAsrClient> {
259        let mut builder = Client::builder();
260        if let Some(proxy) = self.proxy {
261            builder =
262                builder.proxy(reqwest::Proxy::https(&proxy).map_err(CodexAsrError::BuildClient)?);
263        }
264        if let Some(timeout) = self.timeout {
265            builder = builder.timeout(timeout);
266        }
267        if let Some(timeout) = self.connect_timeout {
268            builder = builder.connect_timeout(timeout);
269        }
270        let http = builder.build().map_err(CodexAsrError::BuildClient)?;
271        Ok(CodexAsrClient {
272            endpoint: self.endpoint,
273            auth: self.auth,
274            http,
275            originator: self.originator,
276            user_agent: self.user_agent,
277        })
278    }
279}
280
/// Optional per-request settings for [`CodexAsrClient::transcribe_file`].
#[derive(Clone, Debug, Default)]
pub struct TranscribeOptions {
    /// Sent as the `language` form field when set.
    pub language: Option<String>,
    /// MIME type of the upload; inferred from the file extension when unset.
    pub content_type: Option<String>,
    /// Filename of the upload; derived from the path when unset.
    pub filename: Option<String>,
}
287
/// Successful result of a transcription request.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Transcription {
    /// Text taken from the `text` field of the response.
    pub text: String,
}
292
293#[derive(Debug, Deserialize)]
294struct AuthFile {
295    auth_mode: Option<String>,
296    #[serde(rename = "authMode")]
297    auth_mode_camel: Option<String>,
298    tokens: Option<AuthTokens>,
299}
300
301#[derive(Debug, Deserialize)]
302struct AuthTokens {
303    access_token: Option<String>,
304    account_id: Option<String>,
305}
306
307#[derive(Debug, Deserialize)]
308struct TranscribeResponse {
309    text: Option<String>,
310}
311
/// Guesses the audio MIME type from the extension of `path`.
///
/// The extension is compared ASCII-case-insensitively. A missing, non-UTF-8 or unrecognised
/// extension yields `application/octet-stream`.
pub fn infer_content_type(path: impl AsRef<Path>) -> &'static str {
    const FALLBACK: &str = "application/octet-stream";
    const BY_EXTENSION: &[(&str, &str)] = &[
        ("wav", "audio/wav"),
        ("wave", "audio/wav"),
        ("webm", "audio/webm"),
        ("mp3", "audio/mpeg"),
        ("m4a", "audio/mp4"),
        ("mp4", "audio/mp4"),
        ("ogg", "audio/ogg"),
        ("oga", "audio/ogg"),
        ("flac", "audio/flac"),
    ];

    let Some(extension) = path.as_ref().extension().and_then(|ext| ext.to_str()) else {
        return FALLBACK;
    };
    BY_EXTENSION
        .iter()
        .find(|(known, _)| extension.eq_ignore_ascii_case(known))
        .map_or(FALLBACK, |&(_, mime)| mime)
}
330
331fn upload_filename(path: &Path, content_type: &str) -> String {
332    let original = path
333        .file_name()
334        .and_then(|name| name.to_str())
335        .unwrap_or("codex")
336        .to_string();
337    if infer_content_type(path) != "application/octet-stream" {
338        return original;
339    }
340    let Some(extension) = extension_for_content_type(content_type) else {
341        return original;
342    };
343    let stem = path
344        .file_stem()
345        .and_then(|stem| stem.to_str())
346        .filter(|stem| !stem.is_empty() && !stem.starts_with('.'))
347        .unwrap_or("codex");
348    format!("{stem}.{extension}")
349}
350
/// Maps an audio MIME type to the file extension used for uploads.
///
/// Anything after the first `;` is ignored, surrounding whitespace is trimmed and the
/// comparison is ASCII-case-insensitive. Unknown types yield `None`.
fn extension_for_content_type(content_type: &str) -> Option<&'static str> {
    const BY_MEDIA_TYPE: &[(&str, &str)] = &[
        ("audio/wav", "wav"),
        ("audio/x-wav", "wav"),
        ("audio/wave", "wav"),
        ("audio/mpeg", "mp3"),
        ("audio/mp3", "mp3"),
        ("audio/mp4", "m4a"),
        ("audio/m4a", "m4a"),
        ("audio/x-m4a", "m4a"),
        ("audio/flac", "flac"),
        ("audio/x-flac", "flac"),
        ("audio/ogg", "ogg"),
        ("audio/webm", "webm"),
    ];

    let media_type = content_type
        .split_once(';')
        .map_or(content_type, |(before_parameters, _)| before_parameters)
        .trim();
    BY_MEDIA_TYPE
        .iter()
        .find(|(known, _)| media_type.eq_ignore_ascii_case(known))
        .map(|&(_, extension)| extension)
}
369
370pub fn default_auth_file() -> PathBuf {
371    let codex_home = env::var_os("CODEX_HOME")
372        .map(PathBuf::from)
373        .unwrap_or_else(|| home_dir().join(".codex"));
374    codex_home.join("auth.json")
375}
376
377pub fn resolve_proxy(explicit_proxy: Option<&str>) -> Option<String> {
378    first_non_empty([
379        explicit_proxy.map(str::to_string),
380        env::var("CODEX_ASR_PROXY").ok(),
381        env::var("CODEX_VOICE_PROXY").ok(),
382        env::var("HTTPS_PROXY").ok(),
383        env::var("https_proxy").ok(),
384        env::var("ALL_PROXY").ok(),
385        env::var("all_proxy").ok(),
386        system_https_proxy(),
387    ])
388}
389
/// Returns the first value that is present and not blank, with surrounding whitespace removed.
///
/// Values after the first match are not pulled from the iterator.
fn first_non_empty(values: impl IntoIterator<Item = Option<String>>) -> Option<String> {
    for candidate in values {
        let Some(raw) = candidate else {
            continue;
        };
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            return Some(trimmed.to_string());
        }
    }
    None
}
397
398fn system_https_proxy() -> Option<String> {
399    if cfg!(target_os = "macos") {
400        let output = Command::new("scutil").arg("--proxy").output().ok()?;
401        if !output.status.success() {
402            return None;
403        }
404        return parse_scutil_https_proxy(&String::from_utf8_lossy(&output.stdout));
405    }
406    None
407}
408
/// Extracts the HTTPS proxy from `scutil --proxy` style output as an `http://host:port` URL.
///
/// Lines are read as `key : value` pairs split at the first colon. The result is `None`
/// unless `HTTPSEnable` is `1` and both `HTTPSProxy` and `HTTPSPort` carry a non-empty value.
/// When a key appears more than once, the last occurrence wins. A host that contains a colon
/// (an IPv6 literal) is wrapped in brackets so that the URL stays parseable.
fn parse_scutil_https_proxy(output: &str) -> Option<String> {
    let mut enabled = false;
    let mut host = None;
    let mut port = None;
    for line in output.lines() {
        let Some((key, value)) = line.split_once(':') else {
            continue;
        };
        let value = value.trim();
        match key.trim() {
            "HTTPSEnable" => enabled = value == "1",
            "HTTPSProxy" => host = Some(value),
            "HTTPSPort" => port = Some(value),
            _ => {}
        }
    }
    if !enabled {
        return None;
    }
    // An empty host or port would produce a URL such as `http://:7892`; treat it as unset.
    let host = host.filter(|host| !host.is_empty())?;
    let port = port.filter(|port| !port.is_empty())?;
    if host.contains(':') && !host.starts_with('[') {
        Some(format!("http://[{host}]:{port}"))
    } else {
        Some(format!("http://{host}:{port}"))
    }
}
430
/// Normalises a bearer credential to the bare token.
///
/// Surrounding whitespace is removed, and a leading `Bearer` scheme is dropped when it is
/// followed by whitespace or by nothing at all. The scheme is matched case-insensitively, so
/// `Bearer x`, `bearer x`, `BEARER x` and `Bearer\tx` all yield `x`. Returns `None` when no
/// token remains, which includes input consisting of the scheme alone.
fn strip_bearer_prefix(token: &str) -> Option<String> {
    const SCHEME: &str = "bearer";

    let trimmed = token.trim();
    // `get` returns `None` for input that is too short or whose sixth byte is not a character
    // boundary, so non-ASCII input falls through to the "no prefix" arm.
    let token = match (trimmed.get(..SCHEME.len()), trimmed.get(SCHEME.len()..)) {
        (Some(scheme), Some(rest))
            if scheme.eq_ignore_ascii_case(SCHEME)
                && (rest.is_empty() || rest.starts_with(char::is_whitespace)) =>
        {
            rest.trim_start()
        }
        _ => trimmed,
    };
    (!token.is_empty()).then(|| token.to_string())
}
440
441fn account_id_from_access_token(access_token: &str) -> Option<String> {
442    let payload = access_token.split('.').nth(1)?;
443    let decoded = BASE64_URL_SAFE_NO_PAD.decode(payload).ok()?;
444    let value: Value = serde_json::from_slice(&decoded).ok()?;
445    value
446        .get("https://api.openai.com/auth")?
447        .get("chatgpt_account_id")?
448        .as_str()
449        .filter(|value| !value.is_empty())
450        .map(ToOwned::to_owned)
451}
452
/// Reads the version of an installed Codex desktop app from its `Info.plist`.
///
/// The version is the text of the first `<string>` element that follows the
/// `CFBundleShortVersionString` key. Returns `None` when the file cannot be read or does not
/// have that shape.
fn detect_codex_desktop_version() -> Option<String> {
    const INFO_PLIST: &str = "/Applications/Codex.app/Contents/Info.plist";
    const VERSION_KEY: &str = "<key>CFBundleShortVersionString</key>";
    const OPEN_TAG: &str = "<string>";
    const CLOSE_TAG: &str = "</string>";

    let plist = fs::read_to_string(INFO_PLIST).ok()?;
    let (_, after_key) = plist.split_once(VERSION_KEY)?;
    let (_, after_open_tag) = after_key.split_once(OPEN_TAG)?;
    let (version, _) = after_open_tag.split_once(CLOSE_TAG)?;
    Some(version.to_string())
}
461
/// Returns the directory named by `$HOME`, or `.` when the variable is not set.
fn home_dir() -> PathBuf {
    match env::var_os("HOME") {
        Some(home) => PathBuf::from(home),
        None => PathBuf::from("."),
    }
}
467
/// Condenses a response body for use in an error message.
///
/// Runs of whitespace are collapsed to single spaces and the result is cut to at most 300
/// bytes. The cut is moved back to the nearest character boundary, so a body containing
/// multi-byte characters can never make the truncation panic.
fn clip_response_body(body: &str) -> String {
    const MAX_BYTES: usize = 300;

    let mut clipped = body.split_whitespace().collect::<Vec<_>>().join(" ");
    if clipped.len() > MAX_BYTES {
        let mut end = MAX_BYTES;
        // Index 0 is always a boundary, so this loop terminates.
        while !clipped.is_char_boundary(end) {
            end -= 1;
        }
        clipped.truncate(end);
    }
    clipped
}
473
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to an audio MIME type; unknown ones to a generic byte stream.
    #[test]
    fn content_type_is_inferred_from_extension() {
        let cases = [
            ("voice.wav", "audio/wav"),
            ("voice.webm", "audio/webm"),
            ("voice.unknown", "application/octet-stream"),
        ];
        for (file, expected) in cases {
            assert_eq!(infer_content_type(file), expected, "file: {file}");
        }
    }

    /// The content type only influences the upload name when the extension is unrecognised.
    #[test]
    fn upload_filename_uses_content_type_when_extension_is_unknown() {
        let cases = [
            ("voice", "audio/wav", "voice.wav"),
            ("voice.bin", "audio/webm", "voice.webm"),
            ("voice.wav", "audio/webm", "voice.wav"),
            ("voice.bin", "application/octet-stream", "voice.bin"),
        ];
        for (file, content_type, expected) in cases {
            assert_eq!(
                upload_filename(Path::new(file), content_type),
                expected,
                "file: {file}, content type: {content_type}"
            );
        }
    }

    /// Tokens are accepted with or without the `Bearer ` prefix; blank input is rejected.
    #[test]
    fn bearer_prefix_is_optional() {
        let cases = [
            ("Bearer abc.def", Some("abc.def")),
            ("abc.def", Some("abc.def")),
            ("  ", None),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_bearer_prefix(input).as_deref(), expected);
        }
    }

    /// The account id is taken from the auth claim of the JWT payload.
    #[test]
    fn account_id_can_be_read_from_chatgpt_jwt_payload() {
        let claims = r#"{"https://api.openai.com/auth":{"chatgpt_account_id":"acct_123"}}"#;
        let encoded_claims = BASE64_URL_SAFE_NO_PAD.encode(claims);
        let jwt = format!("header.{encoded_claims}.sig");

        let account_id = account_id_from_access_token(&jwt);

        assert_eq!(account_id.as_deref(), Some("acct_123"));
    }

    /// An enabled HTTPS proxy in `scutil --proxy` output becomes an `http://host:port` URL.
    #[test]
    fn macos_https_proxy_is_parsed() {
        let scutil_output = r#"
<dictionary> {
  HTTPSEnable : 1
  HTTPSProxy : 127.0.0.1
  HTTPSPort : 7892
}
"#;

        let proxy = parse_scutil_https_proxy(scutil_output);

        assert_eq!(proxy.as_deref(), Some("http://127.0.0.1:7892"));
    }
}