use url_preview::{
LLMExtractor, OpenAIProvider, Fetcher,
PreviewError, ContentFormat, LLMExtractorConfig
};
use serde::{Deserialize, Serialize};
use schemars::JsonSchema;
use std::sync::Arc;
// Extraction target passed as the type parameter to `extractor.extract` in
// `main`; every field below is printed there after a successful extraction.
//
// NOTE: plain `//` comments are used on purpose. schemars turns `///` doc
// comments on a `JsonSchema`-derived type into schema descriptions, so adding
// them here would change the generated schema.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
struct WebPageAnalysis {
    // Printed under the "标题" (title) label.
    title: String,
    // Printed under the "描述" (description) label.
    description: String,
    // Printed under the "内容类型" (content type) label; free-form text.
    content_type: String,
    // Printed one per line under "主要主题" (main topics).
    main_topics: Vec<String>,
    // Printed under the "目标受众" (target audience) label.
    target_audience: String,
    // Printed one per line under "关键信息" (key information).
    key_information: Vec<KeyInfo>,
}
// One entry of `WebPageAnalysis::key_information`; `main` prints it as
// `[category] details`.
//
// NOTE: `//` rather than `///` is deliberate — schemars would emit doc comments
// as schema descriptions and thereby alter the generated schema.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
struct KeyInfo {
    // Label shown inside the square brackets.
    category: String,
    // Text shown after the bracketed category.
    details: String,
}
/// Runs structured extraction against a fixed list of sample URLs and prints
/// the results.
///
/// Reads the API key from the `OPENAI_API_KEY` environment variable, builds an
/// `LLMExtractor` backed by an `OpenAIProvider`, and requests a
/// `WebPageAnalysis` for each URL in turn.
///
/// # Errors
///
/// Per-URL extraction failures are printed and do not abort the run; as
/// written, the only value this function returns is `Ok(())`.
///
/// If `OPENAI_API_KEY` is missing (or not valid Unicode) a message is written
/// to stderr and the process exits with status 1 instead of panicking.
#[tokio::main]
async fn main() -> Result<(), PreviewError> {
    // Prints every field of one successfully extracted page analysis.
    fn print_analysis(analysis: &WebPageAnalysis) {
        println!("标题: {}", analysis.title);
        println!("描述: {}", analysis.description);
        println!("内容类型: {}", analysis.content_type);
        println!("目标受众: {}", analysis.target_audience);

        println!("\n主要主题:");
        for topic in &analysis.main_topics {
            println!(" • {}", topic);
        }

        println!("\n关键信息:");
        for info in &analysis.key_information {
            println!(" • [{}] {}", info.category, info.details);
        }
    }

    println!("🚀 OpenAI API 结构化数据提取测试\n");

    // A missing key is a user-environment problem, not a program bug, so report
    // it and exit cleanly rather than panic.
    let api_key = match std::env::var("OPENAI_API_KEY") {
        Ok(key) => key,
        Err(_) => {
            eprintln!("请设置 OPENAI_API_KEY 环境变量");
            std::process::exit(1);
        }
    };

    let provider = Arc::new(OpenAIProvider::new(api_key));
    let config = LLMExtractorConfig {
        format: ContentFormat::Markdown,
        clean_html: true,
        // NOTE(review): presumably an upper bound on the page content handed to
        // the model; the unit is not visible here — confirm in url_preview.
        max_content_length: 15_000,
        model_params: Default::default(),
    };
    let extractor = LLMExtractor::with_config(provider, config);
    let fetcher = Fetcher::new();

    // Loop-invariant separator line, built once.
    let separator = "=".repeat(50);

    // Fixed-size list of (url, display name): an array avoids a needless heap
    // allocation.
    let test_urls = [
        ("https://www.rust-lang.org/", "Rust 官网"),
        ("https://github.com/tokio-rs/tokio", "Tokio 项目"),
        ("https://docs.rs/", "docs.rs 文档站"),
    ];

    for (url, name) in test_urls {
        println!("\n📄 测试: {}", name);
        println!("URL: {}", url);
        println!("{}", separator);

        match extractor.extract::<WebPageAnalysis>(url, &fetcher).await {
            Ok(result) => {
                println!("✅ 成功提取!\n");
                print_analysis(&result.data);
                // Usage is optional in the result; print it only when present.
                if let Some(usage) = &result.usage {
                    println!("\nToken 使用: {}", usage.total_tokens);
                }
            }
            // A failure for one URL is reported and the loop moves on.
            Err(e) => println!("❌ 错误: {}", e),
        }
        println!();
    }

    println!("\n🛍️ 测试电商产品提取");
    println!("{}", separator);

    // NOTE(review): the two schema types below are declared but never used in
    // this function — the section header above is printed without any
    // extraction being performed. Wire them up to a product URL or remove the
    // header. (`//` comments are used so the schemars-generated schema is not
    // altered.)
    #[derive(Debug, Serialize, Deserialize, JsonSchema)]
    struct EcommerceProduct {
        product_name: String,
        brand: Option<String>,
        price: Option<String>,
        currency: Option<String>,
        availability: String,
        rating: Option<f32>,
        review_count: Option<i32>,
        description: String,
        features: Vec<String>,
        specifications: Vec<Specification>,
    }

    // Name/value pair used by `EcommerceProduct::specifications`.
    #[derive(Debug, Serialize, Deserialize, JsonSchema)]
    struct Specification {
        name: String,
        value: String,
    }

    println!("\n✨ 测试完成!");
    println!("\n💡 提示:");
    println!("1. OpenAI 的 function calling 功能非常适合结构化提取");
    println!("2. 使用 gpt-4 模型可以获得更好的提取质量");
    println!("3. 可以通过调整 schema 来控制提取的详细程度");

    Ok(())
}