omnivore_core/parser/
mod.rs

1pub mod extractors;
2pub mod html;
3pub mod schema;
4
5use crate::{Error, Result};
6use scraper::{Html, Selector};
7use serde::{Deserialize, Serialize};
8use serde_json::Value;
9use std::collections::HashMap;
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ParseRule {
13    pub name: String,
14    pub selector: String,
15    pub attribute: Option<String>,
16    pub multiple: bool,
17    pub required: bool,
18    pub transform: Option<String>,
19}
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct ParseConfig {
23    pub rules: Vec<ParseRule>,
24    pub schema_name: Option<String>,
25    pub clean_text: bool,
26    pub extract_metadata: bool,
27}
28
29pub struct Parser {
30    config: ParseConfig,
31}
32
33impl Parser {
34    pub fn new(config: ParseConfig) -> Self {
35        Self { config }
36    }
37
38    pub fn parse(&self, html: &str) -> Result<Value> {
39        let document = Html::parse_document(html);
40        let mut result = serde_json::Map::new();
41
42        for rule in &self.config.rules {
43            let value = self.extract_by_rule(&document, rule)?;
44
45            if rule.required && value.is_null() {
46                return Err(Error::Parse(format!(
47                    "Required field '{}' not found",
48                    rule.name
49                )));
50            }
51
52            result.insert(rule.name.clone(), value);
53        }
54
55        if self.config.extract_metadata {
56            let metadata = self.extract_metadata(&document)?;
57            result.insert("_metadata".to_string(), metadata);
58        }
59
60        Ok(Value::Object(result))
61    }
62
63    fn extract_by_rule(&self, document: &Html, rule: &ParseRule) -> Result<Value> {
64        let selector = Selector::parse(&rule.selector)
65            .map_err(|e| Error::Parse(format!("Invalid selector '{}': {:?}", rule.selector, e)))?;
66
67        let elements: Vec<_> = document.select(&selector).collect();
68
69        if elements.is_empty() {
70            return Ok(Value::Null);
71        }
72
73        if rule.multiple {
74            let values: Vec<Value> = elements
75                .iter()
76                .map(|el| self.extract_value(el, &rule.attribute))
77                .collect();
78            Ok(Value::Array(values))
79        } else {
80            Ok(self.extract_value(&elements[0], &rule.attribute))
81        }
82    }
83
84    fn extract_value(&self, element: &scraper::ElementRef, attribute: &Option<String>) -> Value {
85        let text = if let Some(attr) = attribute {
86            element
87                .value()
88                .attr(attr)
89                .map(|s| s.to_string())
90                .unwrap_or_default()
91        } else {
92            element.text().collect::<String>()
93        };
94
95        let cleaned = if self.config.clean_text {
96            self.clean_text(&text)
97        } else {
98            text
99        };
100
101        Value::String(cleaned)
102    }
103
104    fn clean_text(&self, text: &str) -> String {
105        text.split_whitespace()
106            .filter(|s| !s.is_empty())
107            .collect::<Vec<_>>()
108            .join(" ")
109    }
110
111    fn extract_metadata(&self, document: &Html) -> Result<Value> {
112        let mut metadata = serde_json::Map::new();
113
114        let title_selector = Selector::parse("title").unwrap();
115        if let Some(title) = document.select(&title_selector).next() {
116            metadata.insert(
117                "title".to_string(),
118                Value::String(title.text().collect::<String>()),
119            );
120        }
121
122        let meta_selector = Selector::parse("meta[name], meta[property]").unwrap();
123        let mut meta_tags = HashMap::new();
124
125        for element in document.select(&meta_selector) {
126            let name = element
127                .value()
128                .attr("name")
129                .or_else(|| element.value().attr("property"));
130            let content = element.value().attr("content");
131
132            if let (Some(n), Some(c)) = (name, content) {
133                meta_tags.insert(n.to_string(), c.to_string());
134            }
135        }
136
137        if !meta_tags.is_empty() {
138            metadata.insert("meta_tags".to_string(), serde_json::to_value(meta_tags)?);
139        }
140
141        Ok(Value::Object(metadata))
142    }
143
144    pub fn extract_text(&self, html: &str) -> String {
145        let document = Html::parse_document(html);
146        let body_selector = Selector::parse("body").unwrap();
147
148        document
149            .select(&body_selector)
150            .next()
151            .map(|body| body.text().collect::<String>())
152            .unwrap_or_default()
153    }
154
155    pub fn extract_links(&self, html: &str, base_url: &url::Url) -> Result<Vec<url::Url>> {
156        let document = Html::parse_document(html);
157        let link_selector = Selector::parse("a[href]").unwrap();
158        let mut links = Vec::new();
159
160        for element in document.select(&link_selector) {
161            if let Some(href) = element.value().attr("href") {
162                if let Ok(url) = base_url.join(href) {
163                    links.push(url);
164                }
165            }
166        }
167
168        Ok(links)
169    }
170}