Skip to main content

orbok_extract/
html.rs

//! HTML text extractor (RFC-005 §5; RFC-044 §16.3 resource limits).
//!
//! Strips HTML tags with a simple state-machine parser and preserves
//! visible text content. Block-level elements produce paragraph
//! boundaries. `<h1>`–`<h6>` headings populate `heading_path`.
//!
//! Security: no JavaScript execution, no external resource loading,
//! no DOM construction. Pure text extraction only (RFC-015 §15).
9
10use crate::normalize::normalize_document;
11use crate::types::{
12    DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
13    LocationKind, LocationQuality, SegmentKind, read_error_category,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Identifier this extractor reports from `name()` and records in its output.
const EXTRACTOR_NAME: &str = "html";
/// Version tag this extractor reports from `version()` and records in its output.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Stateless extractor that turns `.html` / `.htm` files into text segments.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// either `HtmlExtractor` or `HtmlExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24    fn name(&self) -> &'static str {
25        EXTRACTOR_NAME
26    }
27
28    fn version(&self) -> &'static str {
29        EXTRACTOR_VERSION
30    }
31
32    fn supported_extensions(&self) -> &'static [&'static str] {
33        &["html", "htm"]
34    }
35
36    fn extract_with_context(
37        &self,
38        path: &ValidatedPath,
39        context: &ExtractContext,
40    ) -> OrbokResult<ExtractOutput> {
41        let limits = &context.limits;
42        let mut warnings = Vec::new();
43
44        // RFC-044 §9.5: check file size before reading.
45        let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
46            category: read_error_category(&e),
47            message: e.to_string(),
48        })?;
49        if meta.len() > limits.max_html_bytes {
50            return Err(OrbokError::Extraction {
51                category: ErrorCategory::FileTooLarge,
52                message: format!(
53                    "HTML file is {} bytes, limit is {}",
54                    meta.len(),
55                    limits.max_html_bytes
56                ),
57            });
58        }
59
60        let content =
61            std::fs::read_to_string(&path.canonical).map_err(|e| OrbokError::Extraction {
62                category: read_error_category(&e),
63                message: e.to_string(),
64            })?;
65
66        let blocks = extract_blocks(&content);
67        let mut segments = Vec::new();
68        let mut total_chars = 0u64;
69        let mut block_idx = 1u32;
70
71        for block in &blocks {
72            let norm = normalize_document(&block.text);
73            if norm.trim().is_empty() {
74                block_idx += 1;
75                continue;
76            }
77
78            // RFC-044 §9.5: extracted char limit — stop and warn.
79            let block_chars = norm.chars().count() as u64;
80            if total_chars + block_chars > limits.max_extracted_chars {
81                warnings.push(ExtractWarning::SizeLimitReached {
82                    limit_name: "max_extracted_chars".into(),
83                });
84                break;
85            }
86            total_chars += block_chars;
87
88            segments.push(ExtractedSegment {
89                kind: block.kind,
90                text: norm,
91                line_start: block_idx,
92                line_end: block_idx,
93                location_kind: LocationKind::Blocks,
94                heading_path: block.heading.clone(),
95                location_quality: LocationQuality::Approximate,
96            });
97            block_idx += 1;
98
99            // RFC-044 §9.5: segment count limit.
100            if segments.len() >= limits.max_segments {
101                warnings.push(ExtractWarning::SizeLimitReached {
102                    limit_name: "max_segments".into(),
103                });
104                break;
105            }
106        }
107
108        Ok(ExtractOutput {
109            extractor_name: EXTRACTOR_NAME.to_string(),
110            extractor_version: EXTRACTOR_VERSION.to_string(),
111            normalization_version: NORMALIZATION_VERSION.to_string(),
112            segments,
113            char_count: total_chars,
114            warnings,
115        })
116    }
117
118    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
119        self.extract_with_context(path, &ExtractContext::default())
120    }
121}
122
123struct Block {
124    kind: SegmentKind,
125    text: String,
126    heading: Option<String>,
127}
128
129/// Extract text blocks from HTML using a simple state machine.
130fn extract_blocks(html: &str) -> Vec<Block> {
131    let mut blocks: Vec<Block> = Vec::new();
132    let mut current = String::new();
133    let mut in_tag = false;
134    let mut tag_name = String::new();
135    let mut current_kind = SegmentKind::Paragraph;
136    let mut heading_stack: Vec<String> = Vec::new();
137    let mut current_heading: Option<String> = None;
138    let mut skip_depth = 0u32; // for script/style nesting
139
140    let push_block = |blocks: &mut Vec<Block>, text: &str, kind, heading: Option<String>| {
141        let trimmed = text.trim().to_string();
142        if !trimmed.is_empty() {
143            blocks.push(Block {
144                kind,
145                text: trimmed,
146                heading,
147            });
148        }
149    };
150
151    for ch in html.chars() {
152        if ch == '<' {
153            in_tag = true;
154            tag_name.clear();
155            continue;
156        }
157        if in_tag {
158            if ch == '>' {
159                in_tag = false;
160                let tag = tag_name.trim().to_ascii_lowercase();
161                let (closing, base) = if let Some(b) = tag.strip_prefix('/') {
162                    (true, b.trim().to_string())
163                } else {
164                    (
165                        false,
166                        tag.split_whitespace().next().unwrap_or("").to_string(),
167                    )
168                };
169
170                match base.as_str() {
171                    "script" | "style" => {
172                        if closing {
173                            skip_depth = skip_depth.saturating_sub(1);
174                        } else {
175                            skip_depth += 1;
176                        }
177                    }
178                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" if !closing => {
179                        push_block(&mut blocks, &current, current_kind, current_heading.clone());
180                        current.clear();
181                        current_kind = SegmentKind::Heading;
182                    }
183                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" if closing => {
184                        let title = current.trim().to_string();
185                        if !title.is_empty() {
186                            heading_stack.retain(|h| h != &title);
187                            heading_stack.push(title.clone());
188                            current_heading = Some(heading_stack.join(" > "));
189                            blocks.push(Block {
190                                kind: SegmentKind::Heading,
191                                text: title,
192                                heading: current_heading.clone(),
193                            });
194                        }
195                        current.clear();
196                        current_kind = SegmentKind::Paragraph;
197                    }
198                    "p" | "div" | "li" | "td" | "th" | "article" | "section" | "blockquote" => {
199                        if !closing {
200                            push_block(
201                                &mut blocks,
202                                &current,
203                                current_kind,
204                                current_heading.clone(),
205                            );
206                            current.clear();
207                            current_kind = SegmentKind::Paragraph;
208                        }
209                    }
210                    "br" => {
211                        current.push('\n');
212                    }
213                    _ => {}
214                }
215            } else {
216                tag_name.push(ch);
217            }
218            continue;
219        }
220        // Text content.
221        if skip_depth == 0 {
222            current.push(ch);
223        }
224    }
225    push_block(&mut blocks, &current, current_kind, current_heading);
226    blocks
227}