parse_book_source/
backend.rs1use super::error::EvalError;
9use super::source::{Extract, ExtractOp, Via};
10use dom_query::{Document, Matcher};
11use fancy_regex::Regex;
12use jsonpath_rust::JsonPath;
13use serde_json::Value;
14use std::sync::LazyLock;
15
16pub fn extract(
18 via: Via,
19 content: &str,
20 select: Option<&str>,
21 index: Option<i64>,
22 ex: &Extract,
23) -> Result<String, EvalError> {
24 match via {
25 Via::Css => html_extract(content, select, index, ex),
26 Via::Json => json_extract(content, select, index, ex),
27 Via::Regex => regex_extract(content, select, index),
28 Via::Raw => Ok(content.to_string()),
29 Via::Xpath => Err(EvalError::Unsupported("xpath")),
30 }
31}
32
33pub fn select_all(via: Via, content: &str, select: &str) -> Result<Vec<String>, EvalError> {
35 match via {
36 Via::Css => {
37 let doc = Document::from(content.to_string());
38 let matcher =
39 Matcher::new(select).map_err(|_| EvalError::Selector(select.to_string()))?;
40 let sel = doc.select_matcher(&matcher);
41 Ok(sel.nodes().iter().map(|n| n.html().to_string()).collect())
42 }
43 Via::Json => {
44 let value: Value =
45 serde_json::from_str(content).map_err(|e| EvalError::Json(e.to_string()))?;
46 let matched = value
47 .query(select)
48 .map_err(|e| EvalError::JsonPath(e.to_string()))?;
49 Ok(matched.into_iter().map(value_to_string).collect())
51 }
52 Via::Regex => {
53 let re = Regex::new(select).map_err(|e| EvalError::Regex(e.to_string()))?;
54 Ok(re
55 .find_iter(content)
56 .filter_map(|m| m.ok())
57 .map(|m| m.as_str().to_string())
58 .collect())
59 }
60 Via::Raw => Ok(vec![content.to_string()]),
61 Via::Xpath => Err(EvalError::Unsupported("xpath")),
62 }
63}
64
65fn html_extract(
68 content: &str,
69 select: Option<&str>,
70 index: Option<i64>,
71 ex: &Extract,
72) -> Result<String, EvalError> {
73 let doc = Document::from(content.to_string());
74 let sel = match select {
77 Some(s) => {
78 let matcher = Matcher::new(s).map_err(|_| EvalError::Selector(s.to_string()))?;
79 doc.select_matcher(&matcher)
80 }
81 None => doc.select(":root"),
82 };
83 let nodes = sel.nodes();
84 if nodes.is_empty() {
85 return Ok(String::new());
86 }
87 let node = &nodes[resolve_index(index, nodes.len())];
88 Ok(match ex {
89 Extract::Op(ExtractOp::Text) => node.text().trim().to_string(),
91 Extract::Op(ExtractOp::OwnText) => node.immediate_text().trim().to_string(),
92 Extract::Op(ExtractOp::Html) => clean_html(&node.inner_html()),
94 Extract::Op(ExtractOp::InnerHtml) => node.inner_html().to_string(),
95 Extract::Op(ExtractOp::OuterHtml) => node.html().to_string(),
96 Extract::Attr { attr } => node
97 .attr(attr)
98 .map(|s| s.trim().to_string())
99 .unwrap_or_default(),
100 })
101}
102
103fn clean_html(html: &str) -> String {
106 static TAGS: LazyLock<Regex> = LazyLock::new(|| {
108 Regex::new(r"</?(?:div|p|br|hr|h[1-6]|article|section|dd|dl|li)[^>]*>").unwrap()
109 });
110 static COMMENTS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());
111 static OTHER_TAGS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
112
113 let s = TAGS.replace_all(html, "\n");
114 let s = COMMENTS.replace_all(&s, "");
115 let s = OTHER_TAGS.replace_all(&s, "");
116 decode_entities(&s)
117}
118
/// Decodes a small set of HTML character references in `s`.
///
/// Recognised references: `&amp;`, `&lt;`, `&gt;`, `&nbsp;` (decoded to a plain
/// space), `&#39;`, `&apos;` and `&quot;`. Anything else, including a lone `&`,
/// is copied through unchanged.
///
/// The input is scanned once from left to right and decoded text is never
/// rescanned, so an escaped reference such as `&amp;lt;` decodes to the literal
/// text `&lt;` instead of being decoded a second time into `<`.
fn decode_entities(s: &str) -> String {
    const ENTITIES: [(&str, &str); 7] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&quot;", "\""),
    ];

    let mut decoded = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        // Copy everything before the ampersand verbatim.
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, replacement)) => {
                decoded.push_str(replacement);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a known reference: keep the `&` and continue after it.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
127
128fn json_extract(
131 content: &str,
132 select: Option<&str>,
133 index: Option<i64>,
134 ex: &Extract,
135) -> Result<String, EvalError> {
136 let value: Value = serde_json::from_str(content).map_err(|e| EvalError::Json(e.to_string()))?;
137 let path = select.unwrap_or("$");
138 let matched = value
139 .query(path)
140 .map_err(|e| EvalError::JsonPath(e.to_string()))?;
141 if matched.is_empty() {
142 return Ok(String::new());
143 }
144 let v = matched[resolve_index(index, matched.len())];
145 let _ = ex;
147 Ok(value_to_string(v))
148}
149
150fn value_to_string(v: &Value) -> String {
151 match v {
152 Value::String(s) => s.clone(),
153 Value::Null => String::new(),
154 other => other.to_string(),
155 }
156}
157
158fn regex_extract(
161 content: &str,
162 select: Option<&str>,
163 index: Option<i64>,
164) -> Result<String, EvalError> {
165 let pat = select.unwrap_or("");
166 let re = Regex::new(pat).map_err(|e| EvalError::Regex(e.to_string()))?;
167 let caps: Vec<String> = re
168 .captures_iter(content)
169 .filter_map(|c| c.ok())
170 .map(|c| {
171 c.get(1)
173 .or_else(|| c.get(0))
174 .map(|m| m.as_str().to_string())
175 .unwrap_or_default()
176 })
177 .collect();
178 if caps.is_empty() {
179 return Ok(String::new());
180 }
181 Ok(caps[resolve_index(index, caps.len())].clone())
182}
183
/// Maps an optional, possibly negative index onto a position in `0..len`.
///
/// * `None` selects position `0`.
/// * A non-negative index is clamped to the last position (`len - 1`).
/// * A negative index counts from the end (`-1` is the last element) and is
///   clamped to position `0` when it reaches past the start.
///
/// The function never panics. For `len == 0` it returns `0`, which is not a
/// valid position in an empty collection, so callers must check for emptiness
/// before indexing with the result.
fn resolve_index(index: Option<i64>, len: usize) -> usize {
    // Arithmetic is done in `u64`: widening `usize` is lossless on targets
    // whose pointers are at most 64 bits, and every result is `<= len`, so the
    // cast back to `usize` cannot truncate.
    let len = len as u64;
    let position = match index {
        None => 0,
        // `i >= 0`, so the cast to `u64` is lossless.
        Some(i) if i >= 0 => (i as u64).min(len.saturating_sub(1)),
        // `unsigned_abs` is well-defined even for `i64::MIN`, where `-i` overflows.
        Some(i) => len.saturating_sub(i.unsigned_abs()),
    };
    position as usize
}