llmweb/
lib.rs

1//! # llmweb.rs
2//! **Powering the Web with Rust & LLMs**
3//!
4//! `llmweb` is a Rust library designed to seamlessly integrate Large Language Models (LLMs)
5//! with web content. It allows you to fetch a webpage, extract its content, and then
6//! use an LLM to get structured data from it based on a provided schema.
7//!
8//! ## Features
9//! - 🚀 Seamless integration with major LLM APIs.
10//! - ✨ Automatic structured data extraction from web content.
11//! - 🔧 Schema-first approach for precise data formatting using `serde_json::Value`.
//! - ⚡ Async-first design for high performance.
13//!
14//! ## Example
15//!
16//! Here's a quick example of how to use `llmweb` to extract stories from Hacker News:
17//!
18//! ```rust,no_run
19//! use llmweb::{LlmWeb, error::LlmWebError};
20//! use serde::{Deserialize, Serialize};
21//! use serde_json::json;
22//!
23//! #[derive(Debug, Serialize, Deserialize)]
24//! struct Story {
25//!     title: String,
26//!     points: f32,
27//!     by: Option<String>,
28//!     comments_url: Option<String>,
29//! }
30//!
31//! #[tokio::main]
32//! async fn main() -> Result<(), LlmWebError> {
33//!     // 1. Define the schema for the data you want to extract.
34//!     let schema_json = json!({
35//!         "type": "array",
36//!         "items": {
37//!             "type": "object",
38//!             "properties": {
39//!                 "by": { "type": "string" },
40//!                 "comments_url": { "type": "string" },
41//!                 "points": { "type": "number" },
42//!                 "title": { "type": "string" }
43//!             },
44//!             "required": ["by", "comments_url", "points", "title"]
45//!         }
46//!     });
47//!
48//!     // 2. Create an LlmWeb instance with the desired model.
49//!     //    Make sure you have the GEMINI_API_KEY environment variable set.
50//!     let llmweb = LlmWeb::new("gemini-1.5-flash");
51//!
//!     // 3. Call `exec` with the URL and schema.
//!     let structured_value: Vec<Story> = llmweb
//!         .exec("https://news.ycombinator.com", schema_json)
//!         .await?;
56//!
57//!     // 4. Print the result.
58//!     println!("{:#?}", structured_value);
59//!
60//!     Ok(())
61//! }
62//! ```
63use {
64    crate::{browser::LlmWebBrower, error::Result},
65    serde::de::DeserializeOwned,
66    std::fmt::Debug,
67};
68
// Crate-private module providing `LlmWebBrower`, used to load a page for a given URL.
mod browser;
// Public module providing the crate's `Result` alias and the `LlmWebError` type.
pub mod error;
// Crate-private module providing `LLMClient`, the handle used to talk to the LLM.
mod models;
72
/// Represents the desired output format.
///
/// Note: This is currently not used but is planned for future versions.
///
/// The enum carries no data, so it is `Copy` and can be compared, hashed and
/// used as a map/set key without any extra work on the caller's side.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LlmWebFormat {
    /// JSON format.
    Json,
    /// YAML format.
    Yaml,
    /// Plain text format.
    Text,
}
85
/// The main struct for interacting with web pages and LLMs.
///
/// It holds the client for the LLM and provides methods to
/// perform completions on web content: `exec`, `exec_from_schema_str`
/// and `stream`. Instances are created with `LlmWeb::new`.
pub struct LlmWeb {
    // LLM client built by `models::LLMClient::new(name)`; every request made by
    // `exec` / `stream` goes through its `completion` / `completion_stream` methods.
    client: models::LLMClient,
}
93
94impl LlmWeb {
95    /// Creates a new `LlmWeb` instance.
96    ///
97    /// # Arguments
98    ///
99    /// * `name` - The name of the LLM model to use (e.g., "gemini-1.5-flash").
100    pub fn new(name: &str) -> Self {
101        Self {
102            client: models::LLMClient::new(name),
103        }
104    }
105
106    /// Fetches content from a URL, sends it to an LLM for processing based on a schema,
107    /// and returns the structured data.
108    ///
109    /// This function performs the following steps:
110    /// 1. Launches a headless browser.
111    /// 2. Navigates to the specified URL.
112    /// 3. Extracts the HTML content of the page.
113    /// 4. Sends the content and a JSON schema to the configured LLM.
114    /// 5. Parses the LLM's JSON response into the specified Rust type `R`.
115    ///
116    /// # Arguments
117    ///
118    /// * `url` - The URL of the web page to process.
119    /// * `scheme` - A serializable object representing the JSON schema for data extraction.
120    ///   This is typically a `serde_json::Value`.
121    ///
122    /// # Errors
123    ///
124    /// This function can return an `LlmWebError` if any of the steps fail, such as
125    /// browser errors, network issues, LLM API errors, or JSON deserialization errors.
126    pub async fn exec<R>(&self, url: &str, scheme: serde_json::Value) -> Result<R>
127    where
128        R: DeserializeOwned + Debug,
129    {
130        let browser = LlmWebBrower::new().await?;
131        let html = browser.run(url).await?;
132        let response = self.client.completion(&html, scheme).await?;
133
134        // The `?` operator is used here thanks to `#[from] serde_json::Error` on LlmWebError.
135        let result: R = serde_json::from_str(&response)?;
136
137        Ok(result)
138    }
139
140    /// A convenience method that accepts a schema as a string slice.
141    ///
142    /// This method is useful when loading a schema from a file. It parses the
143    /// string into a `serde_json::Value` and then calls the main `completion` method.
144    ///
145    /// # Arguments
146    ///
147    /// * `url` - The URL of the web page to process.
148    /// * `schema_str` - A string slice containing the JSON schema.
149    ///
150    /// # Errors
151    ///
152    /// Returns an error if the `schema_str` is not valid JSON, or if any of the
153    /// underlying operations in `completion` fail.
154    pub async fn exec_from_schema_str<R>(&self, url: &str, schema_str: &str) -> Result<R>
155    where
156        R: DeserializeOwned + Debug,
157    {
158        let scheme: serde_json::Value = serde_json::from_str(schema_str)?;
159        self.exec(url, scheme).await
160    }
161
162    /// Fetches content from a URL, sends it to an LLM for processing based on a schema,
163    /// and returns the structured data.
164    ///
165    /// This function performs the following steps:
166    /// 1. Launches a headless browser.
167    /// 2. Navigates to the specified URL.
168    /// 3. Extracts the HTML content of the page.
169    /// 4. Sends the content and a JSON schema to the configured LLM.
170    /// 5. Parses the LLM's JSON response into the specified Rust type `R`.
171    ///
172    /// This method is intended for streaming responses.
173    ///
174    /// # Arguments
175    ///
176    /// * `url` - The URL of the web page to process.
177    /// * `scheme` - A serializable object representing the JSON schema for data extraction.
178    ///   This is typically a `serde_json::Value`.
179    ///
180    /// # Errors
181    ///
182    /// This function can return an `LlmWebError` if any of the steps fail, such as
183    /// browser errors, network issues, LLM API errors, or JSON deserialization errors.
184    pub async fn stream<R>(&self, url: &str, scheme: serde_json::Value) -> Result<R>
185    where
186        R: DeserializeOwned + Debug,
187    {
188        let browser = LlmWebBrower::new().await?;
189        let html = browser.run(url).await?;
190        let response = self.client.completion_stream(&html, scheme).await?;
191
192        let result: R = serde_json::from_str(&response)?;
193
194        Ok(result)
195    }
196}