omnivore_core/
extractor.rs

1use scraper::{Html, Selector, Element};
2use regex::Regex;
3use std::collections::HashSet;
4use crate::table_extractor::{TableExtractor, TableData};
5
/// Extracts titles, readable text, structured records and links from HTML.
///
/// Build one with [`ContentExtractor::new`] or `Default::default()`.
#[derive(Clone, Debug)]
pub struct ContentExtractor {
    /// Minimum length (in bytes, as reported by `String::len`) a cleaned text
    /// fragment needs in order to be kept.
    min_text_length: usize,
    /// Boilerplate-skipping switch. No code visible in this file reads it yet,
    /// hence the lint suppression below.
    #[allow(dead_code)]
    skip_boilerplate: bool,
}
12
13impl Default for ContentExtractor {
14    fn default() -> Self {
15        Self {
16            min_text_length: 30,
17            skip_boilerplate: true,
18        }
19    }
20}
21
22impl ContentExtractor {
23    pub fn new() -> Self {
24        Self::default()
25    }
26
27    pub fn extract_clean_content(&self, html: &str) -> CleanedContent {
28        let document = Html::parse_document(html);
29        
30        let title = self.extract_title(&document);
31        let description = self.extract_meta_description(&document);
32        
33        // Extract tables
34        let table_extractor = TableExtractor::new();
35        let tables = table_extractor.extract_tables(html);
36        
37        // Check for structured content patterns
38        let structured = self.extract_structured_content(&document);
39        
40        // Extract main content if no structured content found
41        let (content, word_count) = if structured.is_some() {
42            (None, self.count_words_in_structured(&structured))
43        } else {
44            let main_text = self.extract_main_content_smart(&document);
45            let wc = main_text.split_whitespace().count();
46            (Some(main_text), wc)
47        };
48        
49        let links = self.extract_unique_links(&document);
50        
51        CleanedContent {
52            title,
53            description,
54            content,
55            structured,
56            tables,
57            links,
58            word_count,
59        }
60    }
61
62    fn extract_title(&self, document: &Html) -> Option<String> {
63        let selector = Selector::parse("title").ok()?;
64        document.select(&selector)
65            .next()
66            .map(|el| self.clean_text(&el.text().collect::<String>()))
67    }
68
69    fn extract_meta_description(&self, document: &Html) -> Option<String> {
70        let selector = Selector::parse("meta[name=\"description\"]").ok()?;
71        document.select(&selector)
72            .next()
73            .and_then(|el| el.value().attr("content"))
74            .map(|s| s.to_string())
75    }
76
77    fn extract_main_content_smart(&self, document: &Html) -> String {
78        // List of selectors for main content areas
79        let content_selectors = vec![
80            "main", "article", "[role=\"main\"]", 
81            ".main-content", "#main-content", ".content",
82            "#content", ".post", ".entry-content",
83            ".article-body", ".story-body"
84        ];
85        
86        // List of selectors to skip (boilerplate)
87        let skip_selectors = vec![
88            "nav", "header", "footer", ".nav", ".menu",
89            ".sidebar", ".advertisement", ".ads", ".cookie",
90            ".popup", ".modal", ".banner", ".breadcrumb",
91            "#comments", ".comments", ".related", ".social",
92            ".share", ".newsletter", ".subscription"
93        ];
94        
95        let mut content_parts = Vec::new();
96        let mut seen_text = HashSet::new();
97        
98        // Try to find main content area
99        for selector_str in content_selectors {
100            if let Ok(selector) = Selector::parse(selector_str) {
101                if let Some(element) = document.select(&selector).next() {
102                    let text = self.extract_text_smart(element, &skip_selectors, &mut seen_text);
103                    if !text.is_empty() && text.len() > 100 {
104                        content_parts.push(text);
105                        break;
106                    }
107                }
108            }
109        }
110        
111        // Fallback to body if no main content found
112        if content_parts.is_empty() {
113            if let Ok(selector) = Selector::parse("body") {
114                if let Some(element) = document.select(&selector).next() {
115                    let text = self.extract_text_smart(element, &skip_selectors, &mut seen_text);
116                    if !text.is_empty() {
117                        content_parts.push(text);
118                    }
119                }
120            }
121        }
122        
123        content_parts.join("\n\n").trim().to_string()
124    }
125    
126    fn extract_text_smart(&self, element: scraper::ElementRef, skip_selectors: &[&str], seen: &mut HashSet<String>) -> String {
127        let mut text_parts = Vec::new();
128        
129        // Check if this element should be skipped
130        for skip_sel in skip_selectors {
131            if let Ok(selector) = Selector::parse(skip_sel) {
132                if element.select(&selector).next().is_some() {
133                    return String::new();
134                }
135            }
136        }
137        
138        // Extract text from paragraphs and headings
139        let text_selectors = vec!["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "blockquote"];
140        
141        for sel_str in text_selectors {
142            if let Ok(selector) = Selector::parse(sel_str) {
143                for el in element.select(&selector) {
144                    let text = el.text().collect::<String>();
145                    let cleaned = self.clean_text(&text);
146                    
147                    // Skip if too short or already seen
148                    if cleaned.len() >= self.min_text_length && !seen.contains(&cleaned) {
149                        seen.insert(cleaned.clone());
150                        text_parts.push(cleaned);
151                    }
152                }
153            }
154        }
155        
156        text_parts.join(" ")
157    }
158    
159    fn extract_structured_content(&self, document: &Html) -> Option<StructuredContent> {
160        let courses = self.extract_courses(document);
161        let sections = self.extract_sections(document);
162        let lists = self.extract_lists(document);
163        let faqs = self.extract_faqs(document);
164        
165        // Only return structured content if we found something
166        if !courses.is_empty() || !sections.is_empty() || !lists.is_empty() || !faqs.is_empty() {
167            Some(StructuredContent {
168                courses,
169                sections,
170                lists,
171                faqs,
172            })
173        } else {
174            None
175        }
176    }
177    
178    fn extract_courses(&self, document: &Html) -> Vec<CourseInfo> {
179        let mut courses = Vec::new();
180        
181        // Pattern 1: Course blocks (like Penn State bulletins)
182        if let Ok(selector) = Selector::parse(".courseblock, .course-block, .course") {
183            for element in document.select(&selector) {
184                if let Some(course) = self.parse_course_block(element) {
185                    courses.push(course);
186                }
187            }
188        }
189        
190        // Pattern 2: Look for course code patterns in headings if no blocks found
191        if courses.is_empty() {
192            courses = self.extract_courses_from_headings(document);
193        }
194        
195        courses
196    }
197    
198    fn parse_course_block(&self, element: scraper::ElementRef) -> Option<CourseInfo> {
199        // Extract course code and title from courseblocktitle
200        let title_selector = Selector::parse(".course_codetitle, .courseblocktitle, .course-title").ok()?;
201        let title_element = element.select(&title_selector).next()?;
202        let title_text = self.clean_text(&title_element.text().collect::<String>());
203        
204        // Parse course code and title
205        let code_pattern = regex::Regex::new(r"^([A-Z]+\s*\d{3}[A-Z]?):?\s*(.*)").ok()?;
206        let captures = code_pattern.captures(&title_text)?;
207        let code = captures.get(1)?.as_str().trim().to_string();
208        let title = captures.get(2)?.as_str().trim().to_string();
209        
210        // Extract credits
211        let mut credits = None;
212        if let Ok(credit_selector) = Selector::parse(".course_credits, .credits") {
213            if let Some(credit_el) = element.select(&credit_selector).next() {
214                credits = Some(self.clean_text(&credit_el.text().collect::<String>()));
215            }
216        }
217        
218        // Extract description from courseblockmeta
219        let mut description = String::new();
220        if let Ok(desc_selector) = Selector::parse(".courseblockdesc, .course-description, .description") {
221            if let Some(desc_el) = element.select(&desc_selector).next() {
222                description = self.clean_text(&desc_el.text().collect::<String>());
223            }
224        }
225        
226        // Extract prerequisites
227        let mut prerequisites = Vec::new();
228        if let Ok(prereq_selector) = Selector::parse(".courseblockextra, .prerequisites") {
229            for prereq_el in element.select(&prereq_selector) {
230                let text = prereq_el.text().collect::<String>();
231                if text.to_lowercase().contains("prerequisite") {
232                    prerequisites.extend(self.extract_prerequisites(&text));
233                }
234            }
235        }
236        
237        if !description.is_empty() || !title.is_empty() {
238            Some(CourseInfo {
239                code,
240                title,
241                credits,
242                description,
243                prerequisites,
244            })
245        } else {
246            None
247        }
248    }
249    
250    #[allow(dead_code)]
251    fn parse_course_element(&self, element: scraper::ElementRef) -> Option<CourseInfo> {
252        let text = element.text().collect::<String>();
253        let code_pattern = regex::Regex::new(r"([A-Z]+\s*\d{3}[A-Z]?)").ok()?;
254        
255        if let Some(capture) = code_pattern.find(&text) {
256            let code = capture.as_str().to_string();
257            
258            // Try to extract title after the code
259            let title_text = text.split(&code).nth(1)?;
260            let title = title_text.split('\n').next()?.trim().to_string();
261            
262            // Extract description
263            let description = text.split('\n')
264                .skip(1)
265                .map(|s| s.trim())
266                .filter(|s| !s.is_empty())
267                .collect::<Vec<_>>()
268                .join(" ");
269            
270            Some(CourseInfo {
271                code,
272                title,
273                credits: self.extract_credits(&text),
274                description,
275                prerequisites: self.extract_prerequisites(&text),
276            })
277        } else {
278            None
279        }
280    }
281    
282    fn extract_courses_from_headings(&self, document: &Html) -> Vec<CourseInfo> {
283        let mut courses = Vec::new();
284        let code_pattern = regex::Regex::new(r"^([A-Z]+\s*\d{3}[A-Z]?)\s*(.*)").unwrap();
285        
286        for level in 2..=5 {
287            let selector_str = format!("h{}", level);
288            if let Ok(selector) = Selector::parse(&selector_str) {
289                for heading in document.select(&selector) {
290                    let heading_text = heading.text().collect::<String>();
291                    
292                    if let Some(captures) = code_pattern.captures(&heading_text) {
293                        let code = captures.get(1).map_or("", |m| m.as_str()).trim().to_string();
294                        let title = captures.get(2).map_or("", |m| m.as_str()).trim().to_string();
295                        
296                        // Get the next sibling content as description
297                        let mut description = String::new();
298                        let mut current = heading;
299                        
300                        // Look for next few siblings for description
301                        for _ in 0..5 {
302                            if let Some(sibling) = current.next_sibling_element() {
303                                let tag = sibling.value().name();
304                                if tag == "p" || tag == "div" {
305                                    description.push_str(&sibling.text().collect::<String>());
306                                    description.push(' ');
307                                } else if tag.starts_with('h') {
308                                    break; // Stop at next heading
309                                }
310                                current = sibling;
311                            } else {
312                                break;
313                            }
314                        }
315                        
316                        if !description.is_empty() {
317                            courses.push(CourseInfo {
318                                code,
319                                title,
320                                credits: self.extract_credits(&heading_text),
321                                description: self.clean_text(&description),
322                                prerequisites: self.extract_prerequisites(&description),
323                            });
324                        }
325                    }
326                }
327            };
328        }
329        
330        courses
331    }
332    
333    #[allow(dead_code)]
334    fn parse_course_dl(&self, _dl: scraper::ElementRef) -> Vec<CourseInfo> {
335        let courses = Vec::new();
336        // TODO: Implementation for definition list parsing
337        courses
338    }
339    
340    fn extract_credits(&self, text: &str) -> Option<String> {
341        let credit_pattern = regex::Regex::new(r"\((\d+(?:-\d+)?)\s*(?:credits?|cr\.?|units?)\)").ok()?;
342        credit_pattern.find(text).map(|m| m.as_str().to_string())
343    }
344    
345    fn extract_prerequisites(&self, text: &str) -> Vec<String> {
346        let mut prereqs = Vec::new();
347        let prereq_pattern = regex::Regex::new(r"(?i)prerequisite[s]?:\s*([^.]+)").unwrap();
348        
349        if let Some(captures) = prereq_pattern.captures(text) {
350            let prereq_text = captures.get(1).map_or("", |m| m.as_str());
351            // Split by common delimiters
352            for part in prereq_text.split(&[',', ';', '|'][..]) {
353                let cleaned = self.clean_text(part);
354                if !cleaned.is_empty() {
355                    prereqs.push(cleaned);
356                }
357            }
358        }
359        
360        prereqs
361    }
362    
363    fn extract_sections(&self, document: &Html) -> Vec<ContentSection> {
364        let mut sections = Vec::new();
365        
366        // Look for article, section tags with headings
367        if let Ok(selector) = Selector::parse("article, section, .section, .content-section") {
368            for element in document.select(&selector) {
369                if let Some(section) = self.parse_section_element(element) {
370                    sections.push(section);
371                }
372            }
373        }
374        
375        sections
376    }
377    
378    fn parse_section_element(&self, element: scraper::ElementRef) -> Option<ContentSection> {
379        // Find heading in section
380        let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").ok()?;
381        let heading = element.select(&heading_selector).next()?;
382        let heading_text = self.clean_text(&heading.text().collect::<String>());
383        
384        // Get content excluding the heading
385        let mut content_parts = Vec::new();
386        if let Ok(p_selector) = Selector::parse("p") {
387            for p in element.select(&p_selector) {
388                let text = self.clean_text(&p.text().collect::<String>());
389                if !text.is_empty() && text.len() > self.min_text_length {
390                    content_parts.push(text);
391                }
392            }
393        }
394        
395        if !content_parts.is_empty() {
396            Some(ContentSection {
397                heading: heading_text,
398                content: content_parts.join(" "),
399                subsections: Vec::new(),
400            })
401        } else {
402            None
403        }
404    }
405    
406    fn extract_lists(&self, document: &Html) -> Vec<ListContent> {
407        let mut lists = Vec::new();
408        
409        // Extract ul and ol lists, but filter out navigation
410        if let Ok(selector) = Selector::parse("ul, ol") {
411            for list in document.select(&selector) {
412                // Skip navigation lists
413                let parent_html = list.html();
414                if parent_html.contains("nav") || parent_html.contains("menu") || 
415                   parent_html.contains("sidebar") || parent_html.contains("breadcrumb") {
416                    continue;
417                }
418                
419                // Check if list is inside main content area
420                let is_in_content = self.is_in_content_area(list);
421                if !is_in_content {
422                    continue;
423                }
424                
425                let mut items = Vec::new();
426                
427                if let Ok(li_selector) = Selector::parse("li") {
428                    for li in list.select(&li_selector) {
429                        let text = self.clean_text(&li.text().collect::<String>());
430                        // Filter out short navigation-like items
431                        if !text.is_empty() && text.len() > 10 && !text.contains("©") {
432                            items.push(text);
433                        }
434                    }
435                }
436                
437                if items.len() > 2 && items.len() < 50 {  // Filter out huge navigation lists
438                    let title = None;
439                    lists.push(ListContent { title, items });
440                }
441            }
442        }
443        
444        lists
445    }
446    
447    fn is_in_content_area(&self, element: scraper::ElementRef) -> bool {
448        // Check if element is within a content area
449        let content_selectors = vec![
450            "main", "article", "[role='main']", 
451            ".content", "#content", ".main-content"
452        ];
453        
454        // Walk up the DOM tree to check for content containers
455        let current = element;
456        for _ in 0..10 {  // Check up to 10 levels
457            for selector_str in &content_selectors {
458                if let Ok(selector) = Selector::parse(selector_str) {
459                    if current.select(&selector).next().is_some() {
460                        return true;
461                    }
462                }
463            }
464            
465            // Try to get parent - this is a simplified check
466            // In real implementation, we'd need proper parent traversal
467            break;
468        }
469        
470        false
471    }
472    
473    fn extract_faqs(&self, document: &Html) -> Vec<FAQItem> {
474        let mut faqs = Vec::new();
475        
476        // Look for FAQ patterns
477        // Pattern 1: FAQ in dl/dt/dd format
478        if let Ok(selector) = Selector::parse("dl.faq, dl.faqs, .faq-list dl") {
479            for dl in document.select(&selector) {
480                faqs.extend(self.parse_faq_dl(dl));
481            }
482        }
483        
484        // Pattern 2: Accordion/details elements
485        if let Ok(selector) = Selector::parse("details, .accordion-item, .faq-item") {
486            for item in document.select(&selector) {
487                if let Some(faq) = self.parse_faq_item(item) {
488                    faqs.push(faq);
489                }
490            }
491        }
492        
493        faqs
494    }
495    
496    fn parse_faq_dl(&self, dl: scraper::ElementRef) -> Vec<FAQItem> {
497        let mut faqs = Vec::new();
498        
499        if let (Ok(dt_sel), Ok(dd_sel)) = (Selector::parse("dt"), Selector::parse("dd")) {
500            let questions: Vec<_> = dl.select(&dt_sel).collect();
501            let answers: Vec<_> = dl.select(&dd_sel).collect();
502            
503            for (q, a) in questions.iter().zip(answers.iter()) {
504                let question = self.clean_text(&q.text().collect::<String>());
505                let answer = self.clean_text(&a.text().collect::<String>());
506                
507                if !question.is_empty() && !answer.is_empty() {
508                    faqs.push(FAQItem { question, answer });
509                }
510            }
511        }
512        
513        faqs
514    }
515    
516    fn parse_faq_item(&self, element: scraper::ElementRef) -> Option<FAQItem> {
517        // For details/summary pattern
518        if element.value().name() == "details" {
519            if let Ok(summary_sel) = Selector::parse("summary") {
520                if let Some(summary) = element.select(&summary_sel).next() {
521                    let question = self.clean_text(&summary.text().collect::<String>());
522                    
523                    // Get answer from remaining content
524                    let mut answer_parts = Vec::new();
525                    for child in element.children() {
526                        if let Some(el) = child.value().as_element() {
527                            if el.name() != "summary" {
528                                if let Some(text_el) = child.value().as_text() {
529                                    answer_parts.push(text_el.to_string());
530                                }
531                            }
532                        }
533                    }
534                    
535                    let answer = self.clean_text(&answer_parts.join(" "));
536                    if !question.is_empty() && !answer.is_empty() {
537                        return Some(FAQItem { question, answer });
538                    }
539                }
540            }
541        }
542        
543        None
544    }
545    
546    fn count_words_in_structured(&self, structured: &Option<StructuredContent>) -> usize {
547        if let Some(s) = structured {
548            let mut count = 0;
549            
550            for course in &s.courses {
551                count += course.title.split_whitespace().count();
552                count += course.description.split_whitespace().count();
553            }
554            
555            for section in &s.sections {
556                count += section.heading.split_whitespace().count();
557                count += section.content.split_whitespace().count();
558            }
559            
560            for list in &s.lists {
561                for item in &list.items {
562                    count += item.split_whitespace().count();
563                }
564            }
565            
566            for faq in &s.faqs {
567                count += faq.question.split_whitespace().count();
568                count += faq.answer.split_whitespace().count();
569            }
570            
571            count
572        } else {
573            0
574        }
575    }
576    
577    fn extract_unique_links(&self, document: &Html) -> Vec<String> {
578        let mut unique_links = HashSet::new();
579        
580        if let Ok(selector) = Selector::parse("a[href]") {
581            for element in document.select(&selector) {
582                if let Some(href) = element.value().attr("href") {
583                    // Skip internal anchors and javascript
584                    if !href.starts_with('#') && !href.starts_with("javascript:") {
585                        // Only include http/https links
586                        if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("/") {
587                            unique_links.insert(href.to_string());
588                        }
589                    }
590                }
591            }
592        }
593        
594        let mut links: Vec<String> = unique_links.into_iter().collect();
595        links.sort();
596        links.truncate(50); // Limit to 50 most relevant links
597        links
598    }
599
600
601    fn clean_text(&self, text: &str) -> String {
602        // Remove excessive whitespace and normalize
603        let re = Regex::new(r"\s+").unwrap();
604        let cleaned = re.replace_all(text.trim(), " ");
605        cleaned.to_string()
606    }
607}
608
609#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
610pub struct CleanedContent {
611    #[serde(skip_serializing_if = "Option::is_none")]
612    pub title: Option<String>,
613    #[serde(skip_serializing_if = "Option::is_none")]
614    pub description: Option<String>,
615    #[serde(skip_serializing_if = "Option::is_none")]
616    pub content: Option<String>,
617    #[serde(skip_serializing_if = "Option::is_none")]
618    pub structured: Option<StructuredContent>,
619    #[serde(skip_serializing_if = "Vec::is_empty")]
620    pub tables: Vec<TableData>,
621    #[serde(skip_serializing_if = "Vec::is_empty")]
622    pub links: Vec<String>,
623    pub word_count: usize,
624}
625
626#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
627pub struct StructuredContent {
628    #[serde(skip_serializing_if = "Vec::is_empty")]
629    pub courses: Vec<CourseInfo>,
630    #[serde(skip_serializing_if = "Vec::is_empty")]
631    pub sections: Vec<ContentSection>,
632    #[serde(skip_serializing_if = "Vec::is_empty")]
633    pub lists: Vec<ListContent>,
634    #[serde(skip_serializing_if = "Vec::is_empty")]
635    pub faqs: Vec<FAQItem>,
636}
637
638#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
639pub struct CourseInfo {
640    pub code: String,
641    pub title: String,
642    #[serde(skip_serializing_if = "Option::is_none")]
643    pub credits: Option<String>,
644    pub description: String,
645    #[serde(skip_serializing_if = "Vec::is_empty")]
646    pub prerequisites: Vec<String>,
647}
648
649#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
650pub struct ContentSection {
651    pub heading: String,
652    pub content: String,
653    #[serde(skip_serializing_if = "Vec::is_empty")]
654    pub subsections: Vec<ContentSection>,
655}
656
657#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
658pub struct ListContent {
659    pub title: Option<String>,
660    pub items: Vec<String>,
661}
662
663#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
664pub struct FAQItem {
665    pub question: String,
666    pub answer: String,
667}
668