omnivore_core/parser/
mod.rs1pub mod extractors;
2pub mod html;
3pub mod schema;
4
5use crate::{Error, Result};
6use scraper::{Html, Selector};
7use serde::{Deserialize, Serialize};
8use serde_json::Value;
9use std::collections::HashMap;
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ParseRule {
13 pub name: String,
14 pub selector: String,
15 pub attribute: Option<String>,
16 pub multiple: bool,
17 pub required: bool,
18 pub transform: Option<String>,
19}
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct ParseConfig {
23 pub rules: Vec<ParseRule>,
24 pub schema_name: Option<String>,
25 pub clean_text: bool,
26 pub extract_metadata: bool,
27}
28
29pub struct Parser {
30 config: ParseConfig,
31}
32
33impl Parser {
34 pub fn new(config: ParseConfig) -> Self {
35 Self { config }
36 }
37
38 pub fn parse(&self, html: &str) -> Result<Value> {
39 let document = Html::parse_document(html);
40 let mut result = serde_json::Map::new();
41
42 for rule in &self.config.rules {
43 let value = self.extract_by_rule(&document, rule)?;
44
45 if rule.required && value.is_null() {
46 return Err(Error::Parse(format!(
47 "Required field '{}' not found",
48 rule.name
49 )));
50 }
51
52 result.insert(rule.name.clone(), value);
53 }
54
55 if self.config.extract_metadata {
56 let metadata = self.extract_metadata(&document)?;
57 result.insert("_metadata".to_string(), metadata);
58 }
59
60 Ok(Value::Object(result))
61 }
62
63 fn extract_by_rule(&self, document: &Html, rule: &ParseRule) -> Result<Value> {
64 let selector = Selector::parse(&rule.selector)
65 .map_err(|e| Error::Parse(format!("Invalid selector '{}': {:?}", rule.selector, e)))?;
66
67 let elements: Vec<_> = document.select(&selector).collect();
68
69 if elements.is_empty() {
70 return Ok(Value::Null);
71 }
72
73 if rule.multiple {
74 let values: Vec<Value> = elements
75 .iter()
76 .map(|el| self.extract_value(el, &rule.attribute))
77 .collect();
78 Ok(Value::Array(values))
79 } else {
80 Ok(self.extract_value(&elements[0], &rule.attribute))
81 }
82 }
83
84 fn extract_value(&self, element: &scraper::ElementRef, attribute: &Option<String>) -> Value {
85 let text = if let Some(attr) = attribute {
86 element
87 .value()
88 .attr(attr)
89 .map(|s| s.to_string())
90 .unwrap_or_default()
91 } else {
92 element.text().collect::<String>()
93 };
94
95 let cleaned = if self.config.clean_text {
96 self.clean_text(&text)
97 } else {
98 text
99 };
100
101 Value::String(cleaned)
102 }
103
104 fn clean_text(&self, text: &str) -> String {
105 text.split_whitespace()
106 .filter(|s| !s.is_empty())
107 .collect::<Vec<_>>()
108 .join(" ")
109 }
110
111 fn extract_metadata(&self, document: &Html) -> Result<Value> {
112 let mut metadata = serde_json::Map::new();
113
114 let title_selector = Selector::parse("title").unwrap();
115 if let Some(title) = document.select(&title_selector).next() {
116 metadata.insert(
117 "title".to_string(),
118 Value::String(title.text().collect::<String>()),
119 );
120 }
121
122 let meta_selector = Selector::parse("meta[name], meta[property]").unwrap();
123 let mut meta_tags = HashMap::new();
124
125 for element in document.select(&meta_selector) {
126 let name = element
127 .value()
128 .attr("name")
129 .or_else(|| element.value().attr("property"));
130 let content = element.value().attr("content");
131
132 if let (Some(n), Some(c)) = (name, content) {
133 meta_tags.insert(n.to_string(), c.to_string());
134 }
135 }
136
137 if !meta_tags.is_empty() {
138 metadata.insert("meta_tags".to_string(), serde_json::to_value(meta_tags)?);
139 }
140
141 Ok(Value::Object(metadata))
142 }
143
144 pub fn extract_text(&self, html: &str) -> String {
145 let document = Html::parse_document(html);
146 let body_selector = Selector::parse("body").unwrap();
147
148 document
149 .select(&body_selector)
150 .next()
151 .map(|body| body.text().collect::<String>())
152 .unwrap_or_default()
153 }
154
155 pub fn extract_links(&self, html: &str, base_url: &url::Url) -> Result<Vec<url::Url>> {
156 let document = Html::parse_document(html);
157 let link_selector = Selector::parse("a[href]").unwrap();
158 let mut links = Vec::new();
159
160 for element in document.select(&link_selector) {
161 if let Some(href) = element.value().attr("href") {
162 if let Ok(url) = base_url.join(href) {
163 links.push(url);
164 }
165 }
166 }
167
168 Ok(links)
169 }
170}