1use crate::normalize::normalize_document;
11use crate::types::{
12 DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
13 LocationKind, LocationQuality, SegmentKind, read_error_category,
14};
15use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Identifier of this extractor: returned by `HtmlExtractor::name` and copied
/// into `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "html";
/// Version tag of this extractor: returned by `HtmlExtractor::version` and
/// copied into `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Stateless extractor for `html` / `htm` files.
///
/// All behavior lives in its `DocumentExtractor` implementation; the type
/// itself carries no data.
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24 fn name(&self) -> &'static str {
25 EXTRACTOR_NAME
26 }
27
28 fn version(&self) -> &'static str {
29 EXTRACTOR_VERSION
30 }
31
32 fn supported_extensions(&self) -> &'static [&'static str] {
33 &["html", "htm"]
34 }
35
36 fn extract_with_context(
37 &self,
38 path: &ValidatedPath,
39 context: &ExtractContext,
40 ) -> OrbokResult<ExtractOutput> {
41 let limits = &context.limits;
42 let mut warnings = Vec::new();
43
44 let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
46 category: read_error_category(&e),
47 message: e.to_string(),
48 })?;
49 if meta.len() > limits.max_html_bytes {
50 return Err(OrbokError::Extraction {
51 category: ErrorCategory::FileTooLarge,
52 message: format!(
53 "HTML file is {} bytes, limit is {}",
54 meta.len(),
55 limits.max_html_bytes
56 ),
57 });
58 }
59
60 let content =
61 std::fs::read_to_string(&path.canonical).map_err(|e| OrbokError::Extraction {
62 category: read_error_category(&e),
63 message: e.to_string(),
64 })?;
65
66 let blocks = extract_blocks(&content);
67 let mut segments = Vec::new();
68 let mut total_chars = 0u64;
69 let mut block_idx = 1u32;
70
71 for block in &blocks {
72 let norm = normalize_document(&block.text);
73 if norm.trim().is_empty() {
74 block_idx += 1;
75 continue;
76 }
77
78 let block_chars = norm.chars().count() as u64;
80 if total_chars + block_chars > limits.max_extracted_chars {
81 warnings.push(ExtractWarning::SizeLimitReached {
82 limit_name: "max_extracted_chars".into(),
83 });
84 break;
85 }
86 total_chars += block_chars;
87
88 segments.push(ExtractedSegment {
89 kind: block.kind,
90 text: norm,
91 line_start: block_idx,
92 line_end: block_idx,
93 location_kind: LocationKind::Blocks,
94 heading_path: block.heading.clone(),
95 location_quality: LocationQuality::Approximate,
96 });
97 block_idx += 1;
98
99 if segments.len() >= limits.max_segments {
101 warnings.push(ExtractWarning::SizeLimitReached {
102 limit_name: "max_segments".into(),
103 });
104 break;
105 }
106 }
107
108 Ok(ExtractOutput {
109 extractor_name: EXTRACTOR_NAME.to_string(),
110 extractor_version: EXTRACTOR_VERSION.to_string(),
111 normalization_version: NORMALIZATION_VERSION.to_string(),
112 segments,
113 char_count: total_chars,
114 warnings,
115 })
116 }
117
118 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
119 self.extract_with_context(path, &ExtractContext::default())
120 }
121}
122
/// One block of visible text produced by `extract_blocks`.
struct Block {
    /// Whether the block is a heading or ordinary paragraph text.
    kind: SegmentKind,
    /// Block text with leading and trailing whitespace trimmed.
    text: String,
    /// Heading titles in effect for this block, joined with `" > "`;
    /// `None` before the first heading of the document.
    heading: Option<String>,
}
128
129fn extract_blocks(html: &str) -> Vec<Block> {
131 let mut blocks: Vec<Block> = Vec::new();
132 let mut current = String::new();
133 let mut in_tag = false;
134 let mut tag_name = String::new();
135 let mut current_kind = SegmentKind::Paragraph;
136 let mut heading_stack: Vec<String> = Vec::new();
137 let mut current_heading: Option<String> = None;
138 let mut skip_depth = 0u32; let push_block = |blocks: &mut Vec<Block>, text: &str, kind, heading: Option<String>| {
141 let trimmed = text.trim().to_string();
142 if !trimmed.is_empty() {
143 blocks.push(Block {
144 kind,
145 text: trimmed,
146 heading,
147 });
148 }
149 };
150
151 for ch in html.chars() {
152 if ch == '<' {
153 in_tag = true;
154 tag_name.clear();
155 continue;
156 }
157 if in_tag {
158 if ch == '>' {
159 in_tag = false;
160 let tag = tag_name.trim().to_ascii_lowercase();
161 let (closing, base) = if let Some(b) = tag.strip_prefix('/') {
162 (true, b.trim().to_string())
163 } else {
164 (
165 false,
166 tag.split_whitespace().next().unwrap_or("").to_string(),
167 )
168 };
169
170 match base.as_str() {
171 "script" | "style" => {
172 if closing {
173 skip_depth = skip_depth.saturating_sub(1);
174 } else {
175 skip_depth += 1;
176 }
177 }
178 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" if !closing => {
179 push_block(&mut blocks, ¤t, current_kind, current_heading.clone());
180 current.clear();
181 current_kind = SegmentKind::Heading;
182 }
183 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" if closing => {
184 let title = current.trim().to_string();
185 if !title.is_empty() {
186 heading_stack.retain(|h| h != &title);
187 heading_stack.push(title.clone());
188 current_heading = Some(heading_stack.join(" > "));
189 blocks.push(Block {
190 kind: SegmentKind::Heading,
191 text: title,
192 heading: current_heading.clone(),
193 });
194 }
195 current.clear();
196 current_kind = SegmentKind::Paragraph;
197 }
198 "p" | "div" | "li" | "td" | "th" | "article" | "section" | "blockquote" => {
199 if !closing {
200 push_block(
201 &mut blocks,
202 ¤t,
203 current_kind,
204 current_heading.clone(),
205 );
206 current.clear();
207 current_kind = SegmentKind::Paragraph;
208 }
209 }
210 "br" => {
211 current.push('\n');
212 }
213 _ => {}
214 }
215 } else {
216 tag_name.push(ch);
217 }
218 continue;
219 }
220 if skip_depth == 0 {
222 current.push(ch);
223 }
224 }
225 push_block(&mut blocks, ¤t, current_kind, current_heading);
226 blocks
227}