1pub mod chunking;
15pub mod clean;
16pub mod filter;
17pub mod markdown;
18#[cfg(feature = "pdf")]
19pub mod pdf;
20pub mod plaintext;
21pub mod readability;
22pub mod selector;
23pub mod structured;
24
25use crw_core::error::{CrwError, CrwResult};
26use crw_core::types::{
27 ChunkResult, ChunkStrategy, FilterMode, OutputFormat, PageMetadata, ScrapeData,
28};
29
/// Inputs for [`extract`]: the fetched page, response details that are copied into the
/// result metadata, and the options that steer selection, output formats and chunking.
pub struct ExtractOptions<'a> {
    /// HTML of the page as fetched. Metadata, cleaning, link extraction and every fallback
    /// start from this string.
    pub raw_html: &'a str,
    /// URL the page came from; handed to link extraction and reported in the metadata.
    pub source_url: &'a str,
    /// Response status code, copied into the metadata unchanged.
    pub status_code: u16,
    /// Copied into the metadata unchanged.
    /// NOTE(review): presumably names the renderer that produced `raw_html` — confirm.
    pub rendered_with: Option<String>,
    /// Copied into the metadata unchanged.
    /// NOTE(review): presumably the fetch/render duration in milliseconds — confirm.
    pub elapsed_ms: u64,
    /// Output formats to produce. An output field is only filled when its format is listed
    /// here (markdown is also built when `Json` is listed).
    pub formats: &'a [OutputFormat],
    /// Forwarded to HTML cleaning. When set and no selector matched, main-content extraction
    /// is run on the cleaned HTML as well.
    pub only_main_content: bool,
    /// Forwarded to HTML cleaning.
    /// NOTE(review): presumably the tags to keep — confirm against `clean::clean_html`.
    pub include_tags: &'a [String],
    /// Forwarded to HTML cleaning.
    /// NOTE(review): presumably the tags to drop — confirm against `clean::clean_html`.
    pub exclude_tags: &'a [String],
    /// CSS selector applied to the cleaned HTML. Tried before `xpath`.
    pub css_selector: Option<&'a str>,
    /// XPath expression applied to the cleaned HTML when `css_selector` is absent or
    /// matched nothing.
    pub xpath: Option<&'a str>,
    /// Enables chunking of the markdown output. Chunks are only produced when this is set
    /// and non-blank markdown was generated.
    pub chunk_strategy: Option<&'a ChunkStrategy>,
    /// Query used to score chunks. Only used together with `filter_mode` and
    /// `chunk_strategy`; a blank query disables scoring.
    pub query: Option<&'a str>,
    /// Scoring mode handed to the chunk filter. Only used together with `query` and
    /// `chunk_strategy`.
    pub filter_mode: Option<&'a FilterMode>,
    /// Chunk limit. When chunks are scored it is passed to the chunk filter (5 if unset);
    /// otherwise the unscored chunk list is truncated to this length (no limit if unset).
    pub top_k: Option<usize>,
}
54
55pub fn extract(opts: ExtractOptions<'_>) -> CrwResult<ScrapeData> {
57 let ExtractOptions {
58 raw_html,
59 source_url,
60 status_code,
61 rendered_with,
62 elapsed_ms,
63 formats,
64 only_main_content,
65 include_tags,
66 exclude_tags,
67 css_selector,
68 xpath,
69 chunk_strategy,
70 query,
71 filter_mode,
72 top_k,
73 } = opts;
74
75 let meta = readability::extract_metadata(raw_html);
77
78 let cleaned = clean::clean_html(raw_html, only_main_content, include_tags, exclude_tags)
80 .unwrap_or_else(|_| raw_html.to_string());
81
82 let selected_html = apply_selector(&cleaned, css_selector, xpath)?;
84 let after_selection = selected_html.as_deref().unwrap_or(&cleaned);
85
86 let (content_html, cleaned_ref) = if only_main_content && selected_html.is_none() {
88 let main = readability::extract_main_content(after_selection);
89 let re_cleaned = clean::clean_html(&main, true, &[], &[]).unwrap_or(main);
92 (re_cleaned, Some(cleaned))
93 } else {
94 (after_selection.to_string(), None)
95 };
96
97 let md = if formats.contains(&OutputFormat::Markdown) || formats.contains(&OutputFormat::Json) {
99 let md = markdown::html_to_markdown(&content_html);
100 let md_too_short =
103 selected_html.is_none() && md.trim().len() < 100 && raw_html.len() > 5000;
104 if md_too_short {
105 let fallback_md = if only_main_content && selected_html.is_none() {
106 let from_cleaned = cleaned_ref
108 .as_ref()
109 .map(|c| markdown::html_to_markdown(c))
110 .unwrap_or_default();
111
112 let basic_md = {
113 let basic_cleaned =
114 clean::clean_html(raw_html, false, include_tags, exclude_tags)
115 .unwrap_or_else(|_| raw_html.to_string());
116 markdown::html_to_markdown(&basic_cleaned)
117 };
118
119 if from_cleaned.trim().len() >= basic_md.trim().len() {
121 from_cleaned
122 } else {
123 basic_md
124 }
125 } else {
126 markdown::html_to_markdown(raw_html)
127 };
128
129 let fallback_too_short = fallback_md.trim().len() < 100 && raw_html.len() > 5000;
130 if fallback_too_short {
131 let text = plaintext::html_to_plaintext(&content_html);
132 if text.trim().is_empty() {
133 let basic_cleaned =
134 clean::clean_html(raw_html, false, include_tags, exclude_tags)
135 .unwrap_or_else(|_| raw_html.to_string());
136 Some(plaintext::html_to_plaintext(&basic_cleaned))
137 } else {
138 Some(text)
139 }
140 } else {
141 Some(fallback_md)
142 }
143 } else {
144 Some(md)
145 }
146 } else {
147 None
148 };
149
150 let plain = if formats.contains(&OutputFormat::PlainText) {
151 Some(plaintext::html_to_plaintext(&content_html))
152 } else {
153 None
154 };
155
156 let raw = if formats.contains(&OutputFormat::RawHtml) {
157 Some(raw_html.to_string())
158 } else {
159 None
160 };
161
162 let html = if formats.contains(&OutputFormat::Html) {
163 Some(content_html)
164 } else {
165 None
166 };
167
168 let links = if formats.contains(&OutputFormat::Links) {
169 Some(readability::extract_links(raw_html, source_url))
170 } else {
171 None
172 };
173
174 let json = None;
176
177 let orphan_chunk_warning =
179 if chunk_strategy.is_none() && (query.is_some() || filter_mode.is_some()) {
180 Some(
181 "'query' and 'filterMode' require 'chunkStrategy' to be set. \
182 These parameters were ignored."
183 .to_string(),
184 )
185 } else {
186 None
187 };
188
189 let chunks = if let Some(strategy) = chunk_strategy
191 && let Some(ref markdown_text) = md
192 && !markdown_text.trim().is_empty()
193 {
194 let raw_chunks = chunking::chunk_text(markdown_text, strategy);
195
196 let chunk_results = if let (Some(q), Some(mode)) = (query, filter_mode)
198 && !q.trim().is_empty()
199 && !raw_chunks.is_empty()
200 {
201 filter::filter_chunks_scored(&raw_chunks, q, mode, top_k.unwrap_or(5))
202 .into_iter()
203 .map(|sc| ChunkResult {
204 content: sc.content,
205 score: Some(sc.score),
206 index: sc.index,
207 })
208 .collect::<Vec<_>>()
209 } else {
210 let mut results: Vec<_> = raw_chunks
211 .into_iter()
212 .enumerate()
213 .map(|(i, c)| ChunkResult {
214 content: c,
215 score: None,
216 index: i,
217 })
218 .collect();
219 if let Some(k) = top_k {
220 results.truncate(k);
221 }
222 results
223 };
224
225 if chunk_results.is_empty() {
226 None
227 } else {
228 Some(chunk_results)
229 }
230 } else {
231 None
232 };
233
234 Ok(ScrapeData {
235 markdown: md,
236 html,
237 raw_html: raw,
238 plain_text: plain,
239 links,
240 json,
241 chunks,
242 warning: orphan_chunk_warning,
243 metadata: PageMetadata {
244 title: meta.title,
245 description: meta.description,
246 og_title: meta.og_title,
247 og_description: meta.og_description,
248 og_image: meta.og_image,
249 canonical_url: meta.canonical_url,
250 source_url: source_url.to_string(),
251 language: meta.language,
252 status_code,
253 rendered_with,
254 elapsed_ms,
255 },
256 })
257}
258
259fn apply_selector(html: &str, css: Option<&str>, xpath: Option<&str>) -> CrwResult<Option<String>> {
262 if let Some(sel) = css {
263 let result = selector::extract_by_css(html, sel).map_err(CrwError::ExtractionError)?;
264 if result.is_some() {
265 return Ok(result);
266 }
267 }
268 if let Some(xp) = xpath
269 && let Some(texts) =
270 selector::extract_by_xpath(html, xp).map_err(CrwError::ExtractionError)?
271 {
272 let wrapped = texts
273 .into_iter()
274 .map(|text| {
275 let escaped = text
276 .replace('&', "&")
277 .replace('<', "<")
278 .replace('>', ">");
279 format!("<div>{escaped}</div>")
280 })
281 .collect::<Vec<_>>()
282 .join("\n");
283 return Ok(Some(wrapped));
284 }
285 Ok(None)
286}