use std::collections::HashSet;
use std::sync::LazyLock;

use regex::Regex;
use scraper::{Element, Html, Selector};

use crate::table_extractor::{TableData, TableExtractor};
5
/// Extracts and cleans the main textual content of an HTML document,
/// producing structured data (courses, sections, lists, FAQs) when the
/// markup allows, or a flat text body otherwise.
#[derive(Debug, Clone)]
pub struct ContentExtractor {
    // Minimum length (in bytes) a cleaned text fragment must reach to be kept.
    min_text_length: usize,
    // Intended toggle for boilerplate skipping; currently never read
    // (extraction always skips boilerplate) — hence the dead_code allow.
    #[allow(dead_code)]
    skip_boilerplate: bool,
}
12
13impl Default for ContentExtractor {
14 fn default() -> Self {
15 Self {
16 min_text_length: 30,
17 skip_boilerplate: true,
18 }
19 }
20}
21
22impl ContentExtractor {
23 pub fn new() -> Self {
24 Self::default()
25 }
26
27 pub fn extract_clean_content(&self, html: &str) -> CleanedContent {
28 let document = Html::parse_document(html);
29
30 let title = self.extract_title(&document);
31 let description = self.extract_meta_description(&document);
32
33 let table_extractor = TableExtractor::new();
35 let tables = table_extractor.extract_tables(html);
36
37 let structured = self.extract_structured_content(&document);
39
40 let (content, word_count) = if structured.is_some() {
42 (None, self.count_words_in_structured(&structured))
43 } else {
44 let main_text = self.extract_main_content_smart(&document);
45 let wc = main_text.split_whitespace().count();
46 (Some(main_text), wc)
47 };
48
49 let links = self.extract_unique_links(&document);
50
51 CleanedContent {
52 title,
53 description,
54 content,
55 structured,
56 tables,
57 links,
58 word_count,
59 }
60 }
61
62 fn extract_title(&self, document: &Html) -> Option<String> {
63 let selector = Selector::parse("title").ok()?;
64 document.select(&selector)
65 .next()
66 .map(|el| self.clean_text(&el.text().collect::<String>()))
67 }
68
69 fn extract_meta_description(&self, document: &Html) -> Option<String> {
70 let selector = Selector::parse("meta[name=\"description\"]").ok()?;
71 document.select(&selector)
72 .next()
73 .and_then(|el| el.value().attr("content"))
74 .map(|s| s.to_string())
75 }
76
77 fn extract_main_content_smart(&self, document: &Html) -> String {
78 let content_selectors = vec![
80 "main", "article", "[role=\"main\"]",
81 ".main-content", "#main-content", ".content",
82 "#content", ".post", ".entry-content",
83 ".article-body", ".story-body"
84 ];
85
86 let skip_selectors = vec![
88 "nav", "header", "footer", ".nav", ".menu",
89 ".sidebar", ".advertisement", ".ads", ".cookie",
90 ".popup", ".modal", ".banner", ".breadcrumb",
91 "#comments", ".comments", ".related", ".social",
92 ".share", ".newsletter", ".subscription"
93 ];
94
95 let mut content_parts = Vec::new();
96 let mut seen_text = HashSet::new();
97
98 for selector_str in content_selectors {
100 if let Ok(selector) = Selector::parse(selector_str) {
101 if let Some(element) = document.select(&selector).next() {
102 let text = self.extract_text_smart(element, &skip_selectors, &mut seen_text);
103 if !text.is_empty() && text.len() > 100 {
104 content_parts.push(text);
105 break;
106 }
107 }
108 }
109 }
110
111 if content_parts.is_empty() {
113 if let Ok(selector) = Selector::parse("body") {
114 if let Some(element) = document.select(&selector).next() {
115 let text = self.extract_text_smart(element, &skip_selectors, &mut seen_text);
116 if !text.is_empty() {
117 content_parts.push(text);
118 }
119 }
120 }
121 }
122
123 content_parts.join("\n\n").trim().to_string()
124 }
125
126 fn extract_text_smart(&self, element: scraper::ElementRef, skip_selectors: &[&str], seen: &mut HashSet<String>) -> String {
127 let mut text_parts = Vec::new();
128
129 for skip_sel in skip_selectors {
131 if let Ok(selector) = Selector::parse(skip_sel) {
132 if element.select(&selector).next().is_some() {
133 return String::new();
134 }
135 }
136 }
137
138 let text_selectors = vec!["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "blockquote"];
140
141 for sel_str in text_selectors {
142 if let Ok(selector) = Selector::parse(sel_str) {
143 for el in element.select(&selector) {
144 let text = el.text().collect::<String>();
145 let cleaned = self.clean_text(&text);
146
147 if cleaned.len() >= self.min_text_length && !seen.contains(&cleaned) {
149 seen.insert(cleaned.clone());
150 text_parts.push(cleaned);
151 }
152 }
153 }
154 }
155
156 text_parts.join(" ")
157 }
158
159 fn extract_structured_content(&self, document: &Html) -> Option<StructuredContent> {
160 let courses = self.extract_courses(document);
161 let sections = self.extract_sections(document);
162 let lists = self.extract_lists(document);
163 let faqs = self.extract_faqs(document);
164
165 if !courses.is_empty() || !sections.is_empty() || !lists.is_empty() || !faqs.is_empty() {
167 Some(StructuredContent {
168 courses,
169 sections,
170 lists,
171 faqs,
172 })
173 } else {
174 None
175 }
176 }
177
178 fn extract_courses(&self, document: &Html) -> Vec<CourseInfo> {
179 let mut courses = Vec::new();
180
181 if let Ok(selector) = Selector::parse(".courseblock, .course-block, .course") {
183 for element in document.select(&selector) {
184 if let Some(course) = self.parse_course_block(element) {
185 courses.push(course);
186 }
187 }
188 }
189
190 if courses.is_empty() {
192 courses = self.extract_courses_from_headings(document);
193 }
194
195 courses
196 }
197
198 fn parse_course_block(&self, element: scraper::ElementRef) -> Option<CourseInfo> {
199 let title_selector = Selector::parse(".course_codetitle, .courseblocktitle, .course-title").ok()?;
201 let title_element = element.select(&title_selector).next()?;
202 let title_text = self.clean_text(&title_element.text().collect::<String>());
203
204 let code_pattern = regex::Regex::new(r"^([A-Z]+\s*\d{3}[A-Z]?):?\s*(.*)").ok()?;
206 let captures = code_pattern.captures(&title_text)?;
207 let code = captures.get(1)?.as_str().trim().to_string();
208 let title = captures.get(2)?.as_str().trim().to_string();
209
210 let mut credits = None;
212 if let Ok(credit_selector) = Selector::parse(".course_credits, .credits") {
213 if let Some(credit_el) = element.select(&credit_selector).next() {
214 credits = Some(self.clean_text(&credit_el.text().collect::<String>()));
215 }
216 }
217
218 let mut description = String::new();
220 if let Ok(desc_selector) = Selector::parse(".courseblockdesc, .course-description, .description") {
221 if let Some(desc_el) = element.select(&desc_selector).next() {
222 description = self.clean_text(&desc_el.text().collect::<String>());
223 }
224 }
225
226 let mut prerequisites = Vec::new();
228 if let Ok(prereq_selector) = Selector::parse(".courseblockextra, .prerequisites") {
229 for prereq_el in element.select(&prereq_selector) {
230 let text = prereq_el.text().collect::<String>();
231 if text.to_lowercase().contains("prerequisite") {
232 prerequisites.extend(self.extract_prerequisites(&text));
233 }
234 }
235 }
236
237 if !description.is_empty() || !title.is_empty() {
238 Some(CourseInfo {
239 code,
240 title,
241 credits,
242 description,
243 prerequisites,
244 })
245 } else {
246 None
247 }
248 }
249
250 #[allow(dead_code)]
251 fn parse_course_element(&self, element: scraper::ElementRef) -> Option<CourseInfo> {
252 let text = element.text().collect::<String>();
253 let code_pattern = regex::Regex::new(r"([A-Z]+\s*\d{3}[A-Z]?)").ok()?;
254
255 if let Some(capture) = code_pattern.find(&text) {
256 let code = capture.as_str().to_string();
257
258 let title_text = text.split(&code).nth(1)?;
260 let title = title_text.split('\n').next()?.trim().to_string();
261
262 let description = text.split('\n')
264 .skip(1)
265 .map(|s| s.trim())
266 .filter(|s| !s.is_empty())
267 .collect::<Vec<_>>()
268 .join(" ");
269
270 Some(CourseInfo {
271 code,
272 title,
273 credits: self.extract_credits(&text),
274 description,
275 prerequisites: self.extract_prerequisites(&text),
276 })
277 } else {
278 None
279 }
280 }
281
282 fn extract_courses_from_headings(&self, document: &Html) -> Vec<CourseInfo> {
283 let mut courses = Vec::new();
284 let code_pattern = regex::Regex::new(r"^([A-Z]+\s*\d{3}[A-Z]?)\s*(.*)").unwrap();
285
286 for level in 2..=5 {
287 let selector_str = format!("h{}", level);
288 if let Ok(selector) = Selector::parse(&selector_str) {
289 for heading in document.select(&selector) {
290 let heading_text = heading.text().collect::<String>();
291
292 if let Some(captures) = code_pattern.captures(&heading_text) {
293 let code = captures.get(1).map_or("", |m| m.as_str()).trim().to_string();
294 let title = captures.get(2).map_or("", |m| m.as_str()).trim().to_string();
295
296 let mut description = String::new();
298 let mut current = heading;
299
300 for _ in 0..5 {
302 if let Some(sibling) = current.next_sibling_element() {
303 let tag = sibling.value().name();
304 if tag == "p" || tag == "div" {
305 description.push_str(&sibling.text().collect::<String>());
306 description.push(' ');
307 } else if tag.starts_with('h') {
308 break; }
310 current = sibling;
311 } else {
312 break;
313 }
314 }
315
316 if !description.is_empty() {
317 courses.push(CourseInfo {
318 code,
319 title,
320 credits: self.extract_credits(&heading_text),
321 description: self.clean_text(&description),
322 prerequisites: self.extract_prerequisites(&description),
323 });
324 }
325 }
326 }
327 };
328 }
329
330 courses
331 }
332
333 #[allow(dead_code)]
334 fn parse_course_dl(&self, _dl: scraper::ElementRef) -> Vec<CourseInfo> {
335 let courses = Vec::new();
336 courses
338 }
339
340 fn extract_credits(&self, text: &str) -> Option<String> {
341 let credit_pattern = regex::Regex::new(r"\((\d+(?:-\d+)?)\s*(?:credits?|cr\.?|units?)\)").ok()?;
342 credit_pattern.find(text).map(|m| m.as_str().to_string())
343 }
344
345 fn extract_prerequisites(&self, text: &str) -> Vec<String> {
346 let mut prereqs = Vec::new();
347 let prereq_pattern = regex::Regex::new(r"(?i)prerequisite[s]?:\s*([^.]+)").unwrap();
348
349 if let Some(captures) = prereq_pattern.captures(text) {
350 let prereq_text = captures.get(1).map_or("", |m| m.as_str());
351 for part in prereq_text.split(&[',', ';', '|'][..]) {
353 let cleaned = self.clean_text(part);
354 if !cleaned.is_empty() {
355 prereqs.push(cleaned);
356 }
357 }
358 }
359
360 prereqs
361 }
362
363 fn extract_sections(&self, document: &Html) -> Vec<ContentSection> {
364 let mut sections = Vec::new();
365
366 if let Ok(selector) = Selector::parse("article, section, .section, .content-section") {
368 for element in document.select(&selector) {
369 if let Some(section) = self.parse_section_element(element) {
370 sections.push(section);
371 }
372 }
373 }
374
375 sections
376 }
377
378 fn parse_section_element(&self, element: scraper::ElementRef) -> Option<ContentSection> {
379 let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").ok()?;
381 let heading = element.select(&heading_selector).next()?;
382 let heading_text = self.clean_text(&heading.text().collect::<String>());
383
384 let mut content_parts = Vec::new();
386 if let Ok(p_selector) = Selector::parse("p") {
387 for p in element.select(&p_selector) {
388 let text = self.clean_text(&p.text().collect::<String>());
389 if !text.is_empty() && text.len() > self.min_text_length {
390 content_parts.push(text);
391 }
392 }
393 }
394
395 if !content_parts.is_empty() {
396 Some(ContentSection {
397 heading: heading_text,
398 content: content_parts.join(" "),
399 subsections: Vec::new(),
400 })
401 } else {
402 None
403 }
404 }
405
406 fn extract_lists(&self, document: &Html) -> Vec<ListContent> {
407 let mut lists = Vec::new();
408
409 if let Ok(selector) = Selector::parse("ul, ol") {
411 for list in document.select(&selector) {
412 let parent_html = list.html();
414 if parent_html.contains("nav") || parent_html.contains("menu") ||
415 parent_html.contains("sidebar") || parent_html.contains("breadcrumb") {
416 continue;
417 }
418
419 let is_in_content = self.is_in_content_area(list);
421 if !is_in_content {
422 continue;
423 }
424
425 let mut items = Vec::new();
426
427 if let Ok(li_selector) = Selector::parse("li") {
428 for li in list.select(&li_selector) {
429 let text = self.clean_text(&li.text().collect::<String>());
430 if !text.is_empty() && text.len() > 10 && !text.contains("©") {
432 items.push(text);
433 }
434 }
435 }
436
437 if items.len() > 2 && items.len() < 50 { let title = None;
439 lists.push(ListContent { title, items });
440 }
441 }
442 }
443
444 lists
445 }
446
447 fn is_in_content_area(&self, element: scraper::ElementRef) -> bool {
448 let content_selectors = vec![
450 "main", "article", "[role='main']",
451 ".content", "#content", ".main-content"
452 ];
453
454 let current = element;
456 for _ in 0..10 { for selector_str in &content_selectors {
458 if let Ok(selector) = Selector::parse(selector_str) {
459 if current.select(&selector).next().is_some() {
460 return true;
461 }
462 }
463 }
464
465 break;
468 }
469
470 false
471 }
472
473 fn extract_faqs(&self, document: &Html) -> Vec<FAQItem> {
474 let mut faqs = Vec::new();
475
476 if let Ok(selector) = Selector::parse("dl.faq, dl.faqs, .faq-list dl") {
479 for dl in document.select(&selector) {
480 faqs.extend(self.parse_faq_dl(dl));
481 }
482 }
483
484 if let Ok(selector) = Selector::parse("details, .accordion-item, .faq-item") {
486 for item in document.select(&selector) {
487 if let Some(faq) = self.parse_faq_item(item) {
488 faqs.push(faq);
489 }
490 }
491 }
492
493 faqs
494 }
495
496 fn parse_faq_dl(&self, dl: scraper::ElementRef) -> Vec<FAQItem> {
497 let mut faqs = Vec::new();
498
499 if let (Ok(dt_sel), Ok(dd_sel)) = (Selector::parse("dt"), Selector::parse("dd")) {
500 let questions: Vec<_> = dl.select(&dt_sel).collect();
501 let answers: Vec<_> = dl.select(&dd_sel).collect();
502
503 for (q, a) in questions.iter().zip(answers.iter()) {
504 let question = self.clean_text(&q.text().collect::<String>());
505 let answer = self.clean_text(&a.text().collect::<String>());
506
507 if !question.is_empty() && !answer.is_empty() {
508 faqs.push(FAQItem { question, answer });
509 }
510 }
511 }
512
513 faqs
514 }
515
516 fn parse_faq_item(&self, element: scraper::ElementRef) -> Option<FAQItem> {
517 if element.value().name() == "details" {
519 if let Ok(summary_sel) = Selector::parse("summary") {
520 if let Some(summary) = element.select(&summary_sel).next() {
521 let question = self.clean_text(&summary.text().collect::<String>());
522
523 let mut answer_parts = Vec::new();
525 for child in element.children() {
526 if let Some(el) = child.value().as_element() {
527 if el.name() != "summary" {
528 if let Some(text_el) = child.value().as_text() {
529 answer_parts.push(text_el.to_string());
530 }
531 }
532 }
533 }
534
535 let answer = self.clean_text(&answer_parts.join(" "));
536 if !question.is_empty() && !answer.is_empty() {
537 return Some(FAQItem { question, answer });
538 }
539 }
540 }
541 }
542
543 None
544 }
545
546 fn count_words_in_structured(&self, structured: &Option<StructuredContent>) -> usize {
547 if let Some(s) = structured {
548 let mut count = 0;
549
550 for course in &s.courses {
551 count += course.title.split_whitespace().count();
552 count += course.description.split_whitespace().count();
553 }
554
555 for section in &s.sections {
556 count += section.heading.split_whitespace().count();
557 count += section.content.split_whitespace().count();
558 }
559
560 for list in &s.lists {
561 for item in &list.items {
562 count += item.split_whitespace().count();
563 }
564 }
565
566 for faq in &s.faqs {
567 count += faq.question.split_whitespace().count();
568 count += faq.answer.split_whitespace().count();
569 }
570
571 count
572 } else {
573 0
574 }
575 }
576
577 fn extract_unique_links(&self, document: &Html) -> Vec<String> {
578 let mut unique_links = HashSet::new();
579
580 if let Ok(selector) = Selector::parse("a[href]") {
581 for element in document.select(&selector) {
582 if let Some(href) = element.value().attr("href") {
583 if !href.starts_with('#') && !href.starts_with("javascript:") {
585 if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("/") {
587 unique_links.insert(href.to_string());
588 }
589 }
590 }
591 }
592 }
593
594 let mut links: Vec<String> = unique_links.into_iter().collect();
595 links.sort();
596 links.truncate(50); links
598 }
599
600
601 fn clean_text(&self, text: &str) -> String {
602 let re = Regex::new(r"\s+").unwrap();
604 let cleaned = re.replace_all(text.trim(), " ");
605 cleaned.to_string()
606 }
607}
608
/// Serializable result of `ContentExtractor::extract_clean_content`.
///
/// Exactly one of `content` (flat text) or `structured` is populated;
/// `word_count` is computed over whichever one is present.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CleanedContent {
    /// Text of the document's `<title>`, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Content of `<meta name="description">`, if present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Flat main-content text; `None` when structured content was found.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Structured content (courses/sections/lists/FAQs), when recognizable.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub structured: Option<StructuredContent>,
    /// Tables extracted by `TableExtractor`.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub tables: Vec<TableData>,
    /// Up to 50 unique, sorted link targets found in the document.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub links: Vec<String>,
    /// Word count over `content` or `structured`, whichever was produced.
    pub word_count: usize,
}
625
/// Structured data recognized in a page; at least one field is non-empty
/// whenever this struct is produced.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct StructuredContent {
    /// Course-catalog entries.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub courses: Vec<CourseInfo>,
    /// Heading-plus-body text sections.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub sections: Vec<ContentSection>,
    /// Content lists (from `<ul>`/`<ol>`).
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub lists: Vec<ListContent>,
    /// Question/answer pairs from FAQ markup.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub faqs: Vec<FAQItem>,
}
637
/// A single course-catalog entry.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CourseInfo {
    /// Course code, e.g. "CS 101".
    pub code: String,
    /// Course title (may be empty when only a description was found).
    pub title: String,
    /// Raw credit annotation text, e.g. "(3 credits)", when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub credits: Option<String>,
    /// Cleaned course description (may be empty).
    pub description: String,
    /// Individual prerequisite phrases parsed from "Prerequisite(s): ...".
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub prerequisites: Vec<String>,
}
648
/// A heading together with the paragraph text found under it.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ContentSection {
    /// Cleaned heading text.
    pub heading: String,
    /// Concatenated paragraph text of the section.
    pub content: String,
    /// Nested subsections; currently always left empty by the extractor.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub subsections: Vec<ContentSection>,
}
656
/// A content list extracted from `<ul>`/`<ol>` markup.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ListContent {
    /// Optional list caption; currently never populated by the extractor.
    pub title: Option<String>,
    /// Cleaned text of the list items (3–49 entries).
    pub items: Vec<String>,
}
662
/// One question/answer pair from FAQ markup.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FAQItem {
    /// Cleaned question text (from `<dt>` or `<summary>`).
    pub question: String,
    /// Cleaned answer text (from `<dd>` or the `<details>` body).
    pub answer: String,
}
668