llmweb/lib.rs
1//! # llmweb.rs
2//! **Powering the Web with Rust & LLMs**
3//!
4//! `llmweb` is a Rust library designed to seamlessly integrate Large Language Models (LLMs)
5//! with web content. It allows you to fetch a webpage, extract its content, and then
6//! use an LLM to get structured data from it based on a provided schema.
7//!
8//! ## Features
9//! - 🚀 Seamless integration with major LLM APIs.
10//! - ✨ Automatic structured data extraction from web content.
11//! - 🔧 Schema-first approach for precise data formatting using `serde_json::Value`.
//! - ⚡ Async-first design for high performance.
13//!
14//! ## Example
15//!
16//! Here's a quick example of how to use `llmweb` to extract stories from Hacker News:
17//!
18//! ```rust,no_run
19//! use llmweb::{LlmWeb, error::LlmWebError};
20//! use serde::{Deserialize, Serialize};
21//! use serde_json::json;
22//!
23//! #[derive(Debug, Serialize, Deserialize)]
24//! struct Story {
25//! title: String,
26//! points: f32,
27//! by: Option<String>,
28//! comments_url: Option<String>,
29//! }
30//!
31//! #[tokio::main]
32//! async fn main() -> Result<(), LlmWebError> {
33//! // 1. Define the schema for the data you want to extract.
34//! let schema_json = json!({
35//! "type": "array",
36//! "items": {
37//! "type": "object",
38//! "properties": {
39//! "by": { "type": "string" },
40//! "comments_url": { "type": "string" },
41//! "points": { "type": "number" },
42//! "title": { "type": "string" }
43//! },
44//! "required": ["by", "comments_url", "points", "title"]
45//! }
46//! });
47//!
48//! // 2. Create an LlmWeb instance with the desired model.
49//! // Make sure you have the GEMINI_API_KEY environment variable set.
50//! let llmweb = LlmWeb::new("gemini-1.5-flash");
51//!
//!     // 3. Call `exec` with the URL and schema.
//!     let structured_value: Vec<Story> = llmweb
//!         .exec("https://news.ycombinator.com", schema_json)
//!         .await?;
56//!
57//! // 4. Print the result.
58//! println!("{:#?}", structured_value);
59//!
60//! Ok(())
61//! }
62//! ```
63use {
64 crate::{browser::LlmWebBrower, error::Result},
65 serde::de::DeserializeOwned,
66 std::fmt::Debug,
67};
68
69mod browser;
70pub mod error;
71mod models;
72
/// Represents the desired output format.
///
/// Note: This is currently not used but is planned for future versions.
///
/// The enum carries no data, so it is cheap to copy and can be compared,
/// hashed, and used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LlmWebFormat {
    /// JSON format.
    Json,
    /// YAML format.
    Yaml,
    /// Plain text format.
    Text,
}
85
86/// The main struct for interacting with web pages and LLMs.
87///
88/// It holds the client for the LLM and provides methods to
89/// perform completions on web content.
90pub struct LlmWeb {
91 client: models::LLMClient,
92}
93
94impl LlmWeb {
95 /// Creates a new `LlmWeb` instance.
96 ///
97 /// # Arguments
98 ///
99 /// * `name` - The name of the LLM model to use (e.g., "gemini-1.5-flash").
100 pub fn new(name: &str) -> Self {
101 Self {
102 client: models::LLMClient::new(name),
103 }
104 }
105
106 /// Fetches content from a URL, sends it to an LLM for processing based on a schema,
107 /// and returns the structured data.
108 ///
109 /// This function performs the following steps:
110 /// 1. Launches a headless browser.
111 /// 2. Navigates to the specified URL.
112 /// 3. Extracts the HTML content of the page.
113 /// 4. Sends the content and a JSON schema to the configured LLM.
114 /// 5. Parses the LLM's JSON response into the specified Rust type `R`.
115 ///
116 /// # Arguments
117 ///
118 /// * `url` - The URL of the web page to process.
119 /// * `scheme` - A serializable object representing the JSON schema for data extraction.
120 /// This is typically a `serde_json::Value`.
121 ///
122 /// # Errors
123 ///
124 /// This function can return an `LlmWebError` if any of the steps fail, such as
125 /// browser errors, network issues, LLM API errors, or JSON deserialization errors.
126 pub async fn exec<R>(&self, url: &str, scheme: serde_json::Value) -> Result<R>
127 where
128 R: DeserializeOwned + Debug,
129 {
130 let browser = LlmWebBrower::new().await?;
131 let html = browser.run(url).await?;
132 let response = self.client.completion(&html, scheme).await?;
133
134 // The `?` operator is used here thanks to `#[from] serde_json::Error` on LlmWebError.
135 let result: R = serde_json::from_str(&response)?;
136
137 Ok(result)
138 }
139
140 /// A convenience method that accepts a schema as a string slice.
141 ///
142 /// This method is useful when loading a schema from a file. It parses the
143 /// string into a `serde_json::Value` and then calls the main `completion` method.
144 ///
145 /// # Arguments
146 ///
147 /// * `url` - The URL of the web page to process.
148 /// * `schema_str` - A string slice containing the JSON schema.
149 ///
150 /// # Errors
151 ///
152 /// Returns an error if the `schema_str` is not valid JSON, or if any of the
153 /// underlying operations in `completion` fail.
154 pub async fn exec_from_schema_str<R>(&self, url: &str, schema_str: &str) -> Result<R>
155 where
156 R: DeserializeOwned + Debug,
157 {
158 let scheme: serde_json::Value = serde_json::from_str(schema_str)?;
159 self.exec(url, scheme).await
160 }
161
162 /// Fetches content from a URL, sends it to an LLM for processing based on a schema,
163 /// and returns the structured data.
164 ///
165 /// This function performs the following steps:
166 /// 1. Launches a headless browser.
167 /// 2. Navigates to the specified URL.
168 /// 3. Extracts the HTML content of the page.
169 /// 4. Sends the content and a JSON schema to the configured LLM.
170 /// 5. Parses the LLM's JSON response into the specified Rust type `R`.
171 ///
172 /// This method is intended for streaming responses.
173 ///
174 /// # Arguments
175 ///
176 /// * `url` - The URL of the web page to process.
177 /// * `scheme` - A serializable object representing the JSON schema for data extraction.
178 /// This is typically a `serde_json::Value`.
179 ///
180 /// # Errors
181 ///
182 /// This function can return an `LlmWebError` if any of the steps fail, such as
183 /// browser errors, network issues, LLM API errors, or JSON deserialization errors.
184 pub async fn stream<R>(&self, url: &str, scheme: serde_json::Value) -> Result<R>
185 where
186 R: DeserializeOwned + Debug,
187 {
188 let browser = LlmWebBrower::new().await?;
189 let html = browser.run(url).await?;
190 let response = self.client.completion_stream(&html, scheme).await?;
191
192 let result: R = serde_json::from_str(&response)?;
193
194 Ok(result)
195 }
196}