// enzymeml/llm.rs

//! LLM integration module for EnzymeML document generation
//!
//! This module provides functionality to interact with Large Language Models (LLMs)
//! to generate EnzymeML documents from natural language descriptions. It currently
//! supports OpenAI's GPT models through their API.
//!
//! # Key Components
//!
//! - `query_llm`: Main function to query the LLM and generate EnzymeML documents
//! - `PromptInput`: Enum for handling different types of input prompts
//! - `LLMError`: Error types specific to LLM operations
use std::path::PathBuf;

use mdmodels::{llm::extraction::query_openai, prelude::DataModel};
use serde_json::Value;
use thiserror::Error;

// The EnzymeML v2 specification (markdown), embedded at compile time.
// It is parsed into a `DataModel` and handed to the LLM as the extraction target.
const SPECS: &str = include_str!("../specs/specifications/v2.md");

/// Default system prompt that instructs the LLM on its role and expected output format.
/// Used by `query_llm` whenever the caller does not supply a custom system prompt.
const DEFAULT_SYSTEM_PROMPT: &str =
    "You are a helpful scientific assistant that is capable to identify scientific facts and data from a given text. You are also capable of extracting information from a given text and returning it in a structured format. Please return the information in a JSON format. Think step by step and work precisely.";

25/// Query an LLM to generate an EnzymeML document from a natural language description
26///
27/// # Arguments
28///
29/// * `prompt` - The main input prompt describing the enzyme kinetics experiment
30/// * `system_prompt` - Optional custom system prompt to override the default
31/// * `llm_model` - Optional specific LLM model to use (defaults to "gpt-4o")
32/// * `api_key` - Optional API key for the LLM service (falls back to OPENAI_API_KEY env var)
33///
34/// # Returns
35///
36/// Returns a Result containing either the generated EnzymeML document or an error
37///
38/// # Errors
39///
40/// Can return various errors including:
41/// - Environment variable not found
42/// - File reading errors
43/// - LLM service errors
44/// - Data model parsing errors
45/// - JSON serialization errors
46pub fn query_llm(
47    prompt: impl Into<PromptInput>,
48    system_prompt: Option<impl Into<PromptInput>>,
49    llm_model: Option<String>,
50    api_key: Option<String>,
51) -> Result<Value, LLMError> {
52    let llm_model = llm_model.unwrap_or_else(|| "gpt-4o".to_string());
53    let api_key = match api_key {
54        Some(key) => key,
55        None => std::env::var("OPENAI_API_KEY").map_err(LLMError::EnvError)?,
56    };
57
58    let prompt: String = prompt.into().try_into()?;
59    let system: String = if let Some(system_prompt) = system_prompt {
60        system_prompt.into().try_into()?
61    } else {
62        DEFAULT_SYSTEM_PROMPT.to_string()
63    };
64
65    let model = DataModel::from_markdown_string(SPECS)
66        .map_err(|e| LLMError::DataModelError(e.to_string()))?;
67
68    tokio::runtime::Runtime::new()
69        .unwrap()
70        .block_on(query_openai(
71            prompt.as_str(),
72            system.as_str(),
73            &model,
74            "EnzymeMLDocument",
75            &llm_model,
76            false,
77            Some(api_key),
78        ))
79        .map_err(LLMError::LLMServiceError)
80}
81
/// Represents different types of input prompts that can be provided to the LLM
///
/// This enum allows for flexible input handling, accepting either direct strings
/// or file paths containing the prompt text. `From` impls exist for `String`,
/// `&str`, and `PathBuf`, so callers rarely construct variants by hand.
#[derive(Debug)]
pub enum PromptInput {
    /// Path to a file containing the prompt text; the file is only read when
    /// the input is converted into a `String`.
    File(PathBuf),
    /// Direct string containing the prompt text
    String(String),
}

94impl TryInto<String> for PromptInput {
95    type Error = LLMError;
96
97    fn try_into(self) -> Result<String, Self::Error> {
98        match self {
99            PromptInput::String(s) => Ok(s),
100            PromptInput::File(path) => {
101                Ok(std::fs::read_to_string(path).map_err(LLMError::FileError)?)
102            }
103        }
104    }
105}
106
107impl From<String> for PromptInput {
108    fn from(s: String) -> Self {
109        PromptInput::String(s)
110    }
111}
112
113impl From<&str> for PromptInput {
114    fn from(s: &str) -> Self {
115        PromptInput::String(s.to_string())
116    }
117}
118
119impl From<PathBuf> for PromptInput {
120    fn from(path: PathBuf) -> Self {
121        PromptInput::File(path)
122    }
123}
124
125/// Errors that can occur during LLM operations
126///
127/// This enum encompasses all possible error types that might occur when
128/// interacting with the LLM service and processing its responses.
129#[derive(Debug, Error)]
130pub enum LLMError {
131    /// Error when reading prompt from file
132    #[error("File not found: {0}")]
133    FileError(#[from] std::io::Error),
134    /// Error when accessing environment variables
135    #[error("Environment variable not found: {0}")]
136    EnvError(#[from] std::env::VarError),
137    /// Error from the LLM service itself
138    #[error("LLM service error: {0}")]
139    LLMServiceError(#[from] Box<dyn std::error::Error>),
140    /// Error in the underlying data model
141    #[error("LLM model error: {0}")]
142    DataModelError(String),
143    /// Error during JSON serialization/deserialization
144    #[error("Serde error: {0}")]
145    SerdeError(#[from] serde_json::Error),
146}