1use crate::error::{HedlError, HedlResult};
47use crate::limits::Limits;
48use std::borrow::Cow;
49
/// Normalized input text together with a precomputed index of its lines.
#[derive(Debug)]
pub struct PreprocessedInput {
    /// The text that every entry of `line_offsets` indexes into.
    text: String,
    /// One `(line number, start byte, end byte)` triple per line; the byte
    /// range is half-open and is used to slice `text`.
    line_offsets: Vec<(usize, usize, usize)>,
}

impl PreprocessedInput {
    /// Iterates over the lines as `(line number, line text)` pairs.
    ///
    /// Each item borrows its text from `self`; nothing is copied.
    #[inline]
    pub fn lines(&self) -> impl Iterator<Item = (usize, &str)> {
        let text = self.text.as_str();
        self.line_offsets
            .iter()
            .copied()
            .map(move |(line_no, from, to)| (line_no, &text[from..to]))
    }
}
69
70pub fn preprocess(input: &[u8], limits: &Limits) -> HedlResult<PreprocessedInput> {
86 if input.len() > limits.max_file_size {
88 return Err(HedlError::security(
89 format!(
90 "file too large: exceeds limit of {} bytes",
91 limits.max_file_size
92 ),
93 0,
94 ));
95 }
96
97 let text = std::str::from_utf8(input)
99 .map_err(|e| HedlError::syntax(format!("invalid UTF-8 encoding: {}", e), 1))?;
100
101 let text = text.strip_prefix('\u{FEFF}').unwrap_or(text);
103
104 let text: Cow<'_, str> = if text.contains('\r') {
107 let normalized = text.replace("\r\n", "\n");
108 if normalized.contains('\r') {
109 let line_num = normalized[..normalized.find('\r').unwrap()]
110 .matches('\n')
111 .count()
112 + 1;
113 return Err(HedlError::syntax(
114 "bare CR (U+000D) not allowed - use LF or CRLF",
115 line_num,
116 ));
117 }
118 Cow::Owned(normalized)
119 } else {
120 Cow::Borrowed(text)
121 };
122
123 let text_ref = text.as_ref();
136 let bytes = text_ref.as_bytes();
137
138 let newline_positions: Vec<usize> = memchr::memchr_iter(b'\n', bytes).collect();
140 let mut line_offsets = Vec::with_capacity(newline_positions.len() + 1);
141
142 let mut start = 0;
143 let mut line_num = 1;
144
145 for &newline_pos in &newline_positions {
147 let line_len = newline_pos - start;
149 if line_len > limits.max_line_length {
150 return Err(HedlError::security(
151 format!(
152 "line too long: exceeds limit of {} bytes",
153 limits.max_line_length
154 ),
155 line_num,
156 ));
157 }
158
159 for &b in &bytes[start..newline_pos] {
162 if b < 0x20 && b != 0x09 && b != 0x0D {
163 return Err(HedlError::syntax(
164 format!("control character U+{:04X} not allowed", b),
165 line_num,
166 ));
167 }
168 }
169
170 line_offsets.push((line_num, start, newline_pos));
171 start = newline_pos + 1;
172 line_num += 1;
173 }
174
175 if start <= bytes.len() {
177 let line_len = bytes.len() - start;
178 if line_len > limits.max_line_length {
179 return Err(HedlError::security(
180 format!(
181 "line too long: exceeds limit of {} bytes",
182 limits.max_line_length
183 ),
184 line_num,
185 ));
186 }
187
188 for &b in &bytes[start..] {
190 if b < 0x20 && b != 0x09 && b != 0x0D {
191 return Err(HedlError::syntax(
192 format!("control character U+{:04X} not allowed", b),
193 line_num,
194 ));
195 }
196 }
197
198 line_offsets.push((line_num, start, bytes.len()));
199 }
200
201 let text_owned = text.into_owned();
203
204 Ok(PreprocessedInput {
205 text: text_owned,
206 line_offsets,
207 })
208}
209
/// Returns `true` when `line` is empty or consists solely of whitespace
/// characters.
pub fn is_blank_line(line: &str) -> bool {
    line.chars().all(char::is_whitespace)
}
214
/// Returns `true` when the first non-whitespace character of `line` is `#`.
///
/// Empty and whitespace-only lines are not comments, and neither is a line
/// whose `#` comes after other content.
pub fn is_comment_line(line: &str) -> bool {
    let first_visible = line.chars().find(|c| !c.is_whitespace());
    first_visible == Some('#')
}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Limits used by every test that does not exercise a specific limit.
    fn default_limits() -> Limits {
        Limits::default()
    }

    /// Preprocesses `input` under the default limits, panicking if it is
    /// rejected, and returns owned `(line number, line text)` pairs.
    fn numbered_lines(input: &[u8]) -> Vec<(usize, String)> {
        preprocess(input, &default_limits())
            .unwrap()
            .lines()
            .map(|(number, text)| (number, text.to_owned()))
            .collect()
    }

    // ---- line splitting ----

    #[test]
    fn test_preprocess_simple() {
        let lines = numbered_lines(b"%VERSION: 1.0\n---\na: 1\n");
        assert_eq!(lines.len(), 4);
        assert_eq!(lines[0], (1, String::from("%VERSION: 1.0")));
    }

    #[test]
    fn test_preprocess_single_line() {
        let lines = numbered_lines(b"hello");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], (1, String::from("hello")));
    }

    #[test]
    fn test_preprocess_empty_input() {
        let lines = numbered_lines(b"");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], (1, String::new()));
    }

    #[test]
    fn test_preprocess_only_newline() {
        let lines = numbered_lines(b"\n");
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0], (1, String::new()));
        assert_eq!(lines[1], (2, String::new()));
    }

    #[test]
    fn test_preprocess_multiple_newlines() {
        assert_eq!(numbered_lines(b"\n\n\n").len(), 4);
    }

    #[test]
    fn test_preprocess_line_numbers() {
        let lines = numbered_lines(b"a\nb\nc\n");
        for (index, expected_number) in [1, 2, 3].into_iter().enumerate() {
            assert_eq!(lines[index].0, expected_number);
        }
    }

    // ---- line endings ----

    #[test]
    fn test_preprocess_crlf() {
        let lines = numbered_lines(b"%VERSION: 1.0\r\n---\r\n");
        assert_eq!(lines[0].1, "%VERSION: 1.0");
    }

    #[test]
    fn test_preprocess_mixed_line_endings() {
        let lines = numbered_lines(b"line1\nline2\r\nline3\n");
        for (index, expected_text) in ["line1", "line2", "line3"].into_iter().enumerate() {
            assert_eq!(lines[index].1, expected_text);
        }
    }

    #[test]
    fn test_preprocess_bare_cr_error() {
        let err = preprocess(b"line1\rline2\n", &default_limits()).unwrap_err();
        assert!(err.message.contains("bare CR"));
    }

    #[test]
    fn test_preprocess_cr_at_end_error() {
        assert!(preprocess(b"line1\r", &default_limits()).is_err());
    }

    #[test]
    fn test_preprocess_crlf_only() {
        assert_eq!(numbered_lines(b"\r\n").len(), 2);
    }

    // ---- byte-order mark ----

    #[test]
    fn test_preprocess_bom_skip() {
        let lines = numbered_lines(b"\xEF\xBB\xBF%VERSION: 1.0\n---\n");
        assert_eq!(lines[0].1, "%VERSION: 1.0");
    }

    #[test]
    fn test_preprocess_bom_only() {
        let lines = numbered_lines(b"\xEF\xBB\xBF");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].1, "");
    }

    #[test]
    fn test_preprocess_bom_with_content() {
        let lines = numbered_lines(b"\xEF\xBB\xBFhello\n");
        assert_eq!(lines[0].1, "hello");
    }

    // ---- UTF-8 handling ----

    #[test]
    fn test_preprocess_valid_utf8() {
        let lines = numbered_lines("こんにちは\n".as_bytes());
        assert_eq!(lines[0].1, "こんにちは");
    }

    #[test]
    fn test_preprocess_emoji() {
        let lines = numbered_lines("😀🎉🚀\n".as_bytes());
        assert_eq!(lines[0].1, "😀🎉🚀");
    }

    #[test]
    fn test_preprocess_invalid_utf8_error() {
        let err = preprocess(b"\xFF\xFE", &default_limits()).unwrap_err();
        assert!(err.message.contains("UTF-8"));
    }

    #[test]
    fn test_preprocess_truncated_utf8_error() {
        assert!(preprocess(b"\xC0", &default_limits()).is_err());
    }

    // ---- control characters ----

    #[test]
    fn test_preprocess_tab_allowed() {
        let lines = numbered_lines(b"a\tb\tc\n");
        assert!(lines[0].1.contains('\t'));
    }

    #[test]
    fn test_preprocess_null_char_error() {
        let err = preprocess(b"hello\x00world\n", &default_limits()).unwrap_err();
        assert!(err.message.contains("U+0000"));
    }

    #[test]
    fn test_preprocess_bell_char_error() {
        let err = preprocess(b"hello\x07world\n", &default_limits()).unwrap_err();
        assert!(err.message.contains("U+0007"));
    }

    #[test]
    fn test_preprocess_backspace_char_error() {
        let err = preprocess(b"hello\x08world\n", &default_limits()).unwrap_err();
        assert!(err.message.contains("U+0008"));
    }

    #[test]
    fn test_preprocess_escape_char_error() {
        let err = preprocess(b"hello\x1Bworld\n", &default_limits()).unwrap_err();
        assert!(err.message.contains("U+001B"));
    }

    #[test]
    fn test_preprocess_control_char_line_number() {
        let err = preprocess(b"line1\nline2\x00\n", &default_limits()).unwrap_err();
        assert_eq!(err.line, 2);
    }

    // ---- size limits ----

    #[test]
    fn test_preprocess_file_size_limit() {
        let limits = Limits {
            max_file_size: 10,
            ..Limits::default()
        };
        // Eleven bytes: one over the limit.
        let err = preprocess(b"12345678901", &limits).unwrap_err();
        assert!(err.message.contains("file too large"));
    }

    #[test]
    fn test_preprocess_file_size_at_limit() {
        let limits = Limits {
            max_file_size: 10,
            ..Limits::default()
        };
        // Exactly ten bytes.
        assert!(preprocess(b"1234567890", &limits).is_ok());
    }

    #[test]
    fn test_preprocess_line_length_limit() {
        let limits = Limits {
            max_line_length: 5,
            ..Limits::default()
        };
        // Six bytes before the newline.
        let err = preprocess(b"123456\n", &limits).unwrap_err();
        assert!(err.message.contains("line too long"));
    }

    #[test]
    fn test_preprocess_line_length_at_limit() {
        let limits = Limits {
            max_line_length: 5,
            ..Limits::default()
        };
        // Exactly five bytes before the newline.
        assert!(preprocess(b"12345\n", &limits).is_ok());
    }

    #[test]
    fn test_preprocess_last_line_length_limit() {
        let limits = Limits {
            max_line_length: 5,
            ..Limits::default()
        };
        // The unterminated second line is the one that is too long.
        let err = preprocess(b"abc\n123456", &limits).unwrap_err();
        assert_eq!(err.line, 2);
    }

    // ---- is_blank_line ----

    #[test]
    fn test_is_blank_line() {
        for blank in ["", " ", "\t "] {
            assert!(is_blank_line(blank));
        }
        assert!(!is_blank_line("a"));
    }

    #[test]
    fn test_is_blank_line_with_tabs() {
        for blank in ["\t", "\t\t\t", " \t "] {
            assert!(is_blank_line(blank));
        }
    }

    #[test]
    fn test_is_blank_line_with_content() {
        for not_blank in ["x", " x ", "\tx"] {
            assert!(!is_blank_line(not_blank));
        }
    }

    #[test]
    fn test_is_blank_line_unicode() {
        assert!(is_blank_line(" "));
    }

    // ---- is_comment_line ----

    #[test]
    fn test_is_comment_line() {
        for comment in ["# comment", " # indented comment"] {
            assert!(is_comment_line(comment));
        }
        assert!(!is_comment_line("a: 1 # inline"));
    }

    #[test]
    fn test_is_comment_line_hash_only() {
        for comment in ["#", " #"] {
            assert!(is_comment_line(comment));
        }
    }

    #[test]
    fn test_is_comment_line_empty_comment() {
        for comment in ["# ", "#\t"] {
            assert!(is_comment_line(comment));
        }
    }

    #[test]
    fn test_is_comment_line_not_comment() {
        for not_comment in ["", " ", "key: value", "key: #value"] {
            assert!(!is_comment_line(not_comment));
        }
    }

    #[test]
    fn test_is_comment_line_with_tabs() {
        for comment in ["\t#comment", "\t\t# comment"] {
            assert!(is_comment_line(comment));
        }
    }

    // ---- PreprocessedInput ----

    #[test]
    fn test_preprocessed_input_lines_iterator() {
        assert_eq!(numbered_lines(b"line1\nline2\nline3\n").len(), 4);
    }

    #[test]
    fn test_preprocessed_input_debug() {
        let parsed = preprocess(b"test\n", &default_limits()).unwrap();
        let rendered = format!("{:?}", parsed);
        assert!(rendered.contains("PreprocessedInput"));
    }

    // ---- larger inputs and trailing-newline handling ----

    #[test]
    fn test_preprocess_very_long_line_ok() {
        let input = format!("{}\n", "x".repeat(1000));
        let lines = numbered_lines(input.as_bytes());
        assert_eq!(lines[0].1.len(), 1000);
    }

    #[test]
    fn test_preprocess_many_lines() {
        let names: Vec<String> = (0..100).map(|i| format!("line{}", i)).collect();
        let input = names.join("\n");
        assert_eq!(numbered_lines(input.as_bytes()).len(), 100);
    }

    #[test]
    fn test_preprocess_trailing_newline_preserved() {
        let lines = numbered_lines(b"line\n");
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[1].1, "");
    }

    #[test]
    fn test_preprocess_no_trailing_newline() {
        let lines = numbered_lines(b"line");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].1, "line");
    }
}