Skip to main content

orbok_extract/
html.rs

1//! HTML text extractor.
2//!
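//! Strips HTML tags with a simple state-machine parser and preserves
//! visible text content. Block-level elements (e.g. `p`, `div`, `h1`–`h6`,
//! `li`, `td`, `th`, `br`; see `BLOCK_TAGS` for the full list) produce
//! paragraph boundaries. `<h1>`–`<h6>` headings populate `heading_path`.
//! Content inside `script`, `style`, `head`, `noscript` and `template`
//! elements is dropped.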
7//!
8//! Security: no JavaScript execution, no external resource loading,
9//! no DOM construction. Pure text extraction only (RFC-015 §15).
10
11use crate::normalize::normalize_document;
12use crate::types::{
13    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Extractor identifier returned by `HtmlExtractor::name` and written to
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "html";
/// Extractor version tag returned by `HtmlExtractor::version` and written to
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Stateless text extractor for `.html` / `.htm` files.
///
/// All behaviour lives in its [`DocumentExtractor`] implementation.
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24    fn name(&self) -> &'static str {
25        EXTRACTOR_NAME
26    }
27    fn version(&self) -> &'static str {
28        EXTRACTOR_VERSION
29    }
30    fn supported_extensions(&self) -> &'static [&'static str] {
31        &["html", "htm"]
32    }
33
34    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
35        let content = std::fs::read_to_string(&path.canonical)?;
36        let blocks = extract_blocks(&content);
37        let mut segments = Vec::new();
38        let mut total_chars = 0u64;
39        let mut line = 1u32;
40
41        for block in &blocks {
42            let norm = normalize_document(&block.text);
43            if norm.trim().is_empty() {
44                line += 1;
45                continue;
46            }
47            total_chars += norm.len() as u64;
48            segments.push(ExtractedSegment {
49                kind: block.kind,
50                text: norm,
51                line_start: line,
52                line_end: line,
53                heading_path: block.heading.clone(),
54                location_quality: LocationQuality::Approximate,
55            });
56            line += 1;
57        }
58
59        Ok(ExtractOutput {
60            extractor_name: EXTRACTOR_NAME.to_string(),
61            extractor_version: EXTRACTOR_VERSION.to_string(),
62            normalization_version: NORMALIZATION_VERSION.to_string(),
63            segments,
64            char_count: total_chars,
65        })
66    }
67}
68
/// One run of visible text produced by `extract_blocks`.
struct Block {
    /// Text content of the block, with surrounding whitespace trimmed.
    text: String,
    /// `Heading` for text terminated by a closing `</h1>`–`</h6>` tag,
    /// `Paragraph` for everything else.
    kind: SegmentKind,
    /// Heading trail (joined with `" > "`) in effect when the block was
    /// emitted; `None` until the first non-empty heading has been closed.
    heading: Option<String>,
}
74
/// Block-level elements that cause paragraph breaks.
///
/// Both the opening and the closing form of these tags flush the text
/// accumulated so far into its own block. Names are lowercase because tag
/// names are lowercased before lookup.
const BLOCK_TAGS: &[&str] = &[
    "p",
    "div",
    "section",
    "article",
    "aside",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "li",
    "dt",
    "dd",
    "td",
    "th",
    "caption",
    "blockquote",
    "pre",
    "br",
];
/// Heading elements; the digit after `h` is parsed as the heading level when
/// maintaining the heading trail.
const HEADING_TAGS: &[&str] = &["h1", "h2", "h3", "h4", "h5", "h6"];
/// Tags whose content should be suppressed entirely.
const SKIP_TAGS: &[&str] = &["script", "style", "head", "noscript", "template"];
101
102fn extract_blocks(html: &str) -> Vec<Block> {
103    let mut blocks = Vec::new();
104    let mut current_text = String::new();
105    let mut current_heading: Option<String> = None;
106    let mut heading_trail: Vec<String> = Vec::new();
107    let mut skip_depth: usize = 0;
108    let mut skip_tag = "";
109    let mut pos = 0;
110    let chars: Vec<char> = html.chars().collect();
111    let n = chars.len();
112
113    let flush = |text: &mut String, heading: &Option<String>, blocks: &mut Vec<Block>| {
114        let t = text.trim().to_string();
115        if !t.is_empty() {
116            blocks.push(Block {
117                text: t,
118                kind: SegmentKind::Paragraph,
119                heading: heading.clone(),
120            });
121        }
122        text.clear();
123    };
124
125    while pos < n {
126        if chars[pos] == '<' {
127            // Collect tag
128            let mut tag_end = pos + 1;
129            while tag_end < n && chars[tag_end] != '>' {
130                tag_end += 1;
131            }
132            let tag_str: String = chars[pos..tag_end.min(n)].iter().collect();
133            let is_close = tag_str.starts_with("</");
134            let tag_name = tag_str
135                .trim_start_matches('<')
136                .trim_start_matches('/')
137                .split_whitespace()
138                .next()
139                .unwrap_or("")
140                .to_ascii_lowercase();
141
142            if skip_depth > 0 {
143                if is_close && tag_name == skip_tag {
144                    skip_depth -= 1;
145                } else if !is_close && tag_name == skip_tag {
146                    skip_depth += 1;
147                }
148            } else if !is_close && SKIP_TAGS.contains(&tag_name.as_str()) {
149                skip_tag = SKIP_TAGS
150                    .iter()
151                    .find(|&&t| t == tag_name.as_str())
152                    .copied()
153                    .unwrap_or("");
154                skip_depth = 1;
155                flush(&mut current_text, &current_heading, &mut blocks);
156            } else if is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
157                // Closing a heading: record it and update heading_path context.
158                let heading_text = current_text.trim().to_string();
159                if !heading_text.is_empty() {
160                    let level = tag_name[1..].parse::<usize>().unwrap_or(1);
161                    heading_trail.truncate(level.saturating_sub(1));
162                    heading_trail.push(heading_text.clone());
163                    current_heading = Some(heading_trail.join(" > "));
164                    blocks.push(Block {
165                        text: heading_text,
166                        kind: SegmentKind::Heading,
167                        heading: current_heading.clone(),
168                    });
169                    current_text.clear();
170                }
171            } else if BLOCK_TAGS.contains(&tag_name.as_str()) {
172                flush(&mut current_text, &current_heading, &mut blocks);
173                if !is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
174                    let level = tag_name[1..].parse::<usize>().unwrap_or(1);
175                    heading_trail.truncate(level.saturating_sub(1));
176                }
177            }
178            pos = tag_end + 1;
179        } else if skip_depth == 0 {
180            // Decode common HTML entities inline.
181            let c = chars[pos];
182            if c == '&' {
183                let semi = chars[pos..].iter().position(|&x| x == ';').map(|p| pos + p);
184                if let Some(end) = semi {
185                    let entity: String = chars[pos..=end].iter().collect();
186                    match entity.as_str() {
187                        "&amp;" => current_text.push('&'),
188                        "&lt;" => current_text.push('<'),
189                        "&gt;" => current_text.push('>'),
190                        "&quot;" | "&ldquo;" | "&rdquo;" => current_text.push('"'),
191                        "&apos;" | "&lsquo;" | "&rsquo;" => current_text.push('\''),
192                        "&nbsp;" | "&#160;" => current_text.push(' '),
193                        _ => current_text.push_str(&entity),
194                    }
195                    pos = end + 1;
196                    continue;
197                }
198            }
199            current_text.push(c);
200            pos += 1;
201        } else {
202            pos += 1;
203        }
204    }
205    flush(&mut current_text, &current_heading, &mut blocks);
206    blocks
207}