1use crate::error::{HedlError, HedlResult};
47use crate::limits::Limits;
48use std::borrow::Cow;
49
/// Preprocessed text together with an index of the lines it contains.
#[derive(Debug)]
pub struct PreprocessedInput {
    /// The text that the entries of `line_offsets` slice into.
    text: String,
    /// One `(line number, start byte, end byte)` entry per line; `start..end`
    /// is a byte range into `text`.
    line_offsets: Vec<(usize, usize, usize)>,
}

impl PreprocessedInput {
    /// Iterates over the indexed lines as `(line number, line text)` pairs,
    /// in the order they are stored.
    ///
    /// # Panics
    ///
    /// Advancing the iterator panics if a stored range is out of bounds for
    /// the text or does not fall on UTF-8 character boundaries.
    #[inline]
    pub fn lines(&self) -> impl Iterator<Item = (usize, &str)> {
        let text = self.text.as_str();
        self.line_offsets
            .iter()
            .copied()
            .map(move |(number, from, to)| (number, &text[from..to]))
    }
}
69
70pub fn preprocess(input: &[u8], limits: &Limits) -> HedlResult<PreprocessedInput> {
86 if input.len() > limits.max_file_size {
88 return Err(HedlError::security(
89 format!(
90 "file too large: exceeds limit of {} bytes",
91 limits.max_file_size
92 ),
93 0,
94 ));
95 }
96
97 let text = std::str::from_utf8(input)
99 .map_err(|e| HedlError::syntax(format!("invalid UTF-8 encoding: {}", e), 1))?;
100
101 let text = text.strip_prefix('\u{FEFF}').unwrap_or(text);
103
104 let text: Cow<'_, str> = if text.contains('\r') {
107 let normalized = text.replace("\r\n", "\n");
108 if normalized.contains('\r') {
109 let line_num = normalized[..normalized
111 .find('\r')
112 .expect("contains guarantees CR exists")]
113 .matches('\n')
114 .count()
115 + 1;
116 return Err(HedlError::syntax(
117 "bare CR (U+000D) not allowed - use LF or CRLF",
118 line_num,
119 ));
120 }
121 Cow::Owned(normalized)
122 } else {
123 Cow::Borrowed(text)
124 };
125
126 let text_ref = text.as_ref();
139 let bytes = text_ref.as_bytes();
140
141 let newline_positions: Vec<usize> = memchr::memchr_iter(b'\n', bytes).collect();
143 let mut line_offsets = Vec::with_capacity(newline_positions.len() + 1);
144
145 let mut start = 0;
146 let mut line_num = 1;
147
148 for &newline_pos in &newline_positions {
150 let line_len = newline_pos - start;
152 if line_len > limits.max_line_length {
153 return Err(HedlError::security(
154 format!(
155 "line too long: exceeds limit of {} bytes",
156 limits.max_line_length
157 ),
158 line_num,
159 ));
160 }
161
162 for &b in &bytes[start..newline_pos] {
165 if b < 0x20 && b != 0x09 && b != 0x0D {
166 return Err(HedlError::syntax(
167 format!("control character U+{:04X} not allowed", b),
168 line_num,
169 ));
170 }
171 }
172
173 line_offsets.push((line_num, start, newline_pos));
174 start = newline_pos + 1;
175 line_num += 1;
176 }
177
178 if start <= bytes.len() {
180 let line_len = bytes.len() - start;
181 if line_len > limits.max_line_length {
182 return Err(HedlError::security(
183 format!(
184 "line too long: exceeds limit of {} bytes",
185 limits.max_line_length
186 ),
187 line_num,
188 ));
189 }
190
191 for &b in &bytes[start..] {
193 if b < 0x20 && b != 0x09 && b != 0x0D {
194 return Err(HedlError::syntax(
195 format!("control character U+{:04X} not allowed", b),
196 line_num,
197 ));
198 }
199 }
200
201 line_offsets.push((line_num, start, bytes.len()));
202 }
203
204 let text_owned = text.into_owned();
206
207 Ok(PreprocessedInput {
208 text: text_owned,
209 line_offsets,
210 })
211}
212
/// Returns `true` when `line` is empty or consists solely of whitespace
/// characters (as defined by [`char::is_whitespace`]).
pub fn is_blank_line(line: &str) -> bool {
    line.chars().all(char::is_whitespace)
}
217
/// Returns `true` when the first non-whitespace character of `line` is `#`.
///
/// A `#` that appears after other content does not make the line a comment.
pub fn is_comment_line(line: &str) -> bool {
    let first_visible = line.chars().find(|c| !c.is_whitespace());
    first_visible == Some('#')
}
222
#[cfg(test)]
mod tests {
    use super::*;

    fn default_limits() -> Limits {
        Limits::default()
    }

    /// Runs `preprocess` with the default limits and panics if it fails.
    fn accept(input: &[u8]) -> PreprocessedInput {
        preprocess(input, &default_limits()).unwrap()
    }

    /// Runs `preprocess` with the default limits and panics if it succeeds.
    fn reject(input: &[u8]) -> HedlError {
        preprocess(input, &default_limits()).unwrap_err()
    }

    /// Collects just the text of every line, dropping the line numbers.
    fn texts(doc: &PreprocessedInput) -> Vec<&str> {
        doc.lines().map(|(_, text)| text).collect()
    }

    /// Default limits with only the file-size cap overridden.
    fn file_cap(max_file_size: usize) -> Limits {
        Limits {
            max_file_size,
            ..Limits::default()
        }
    }

    /// Default limits with only the line-length cap overridden.
    fn line_cap(max_line_length: usize) -> Limits {
        Limits {
            max_line_length,
            ..Limits::default()
        }
    }

    // ---- basic splitting ----

    #[test]
    fn test_preprocess_simple() {
        let doc = accept(b"%V:2.0\n%NULL:~\n%QUOTE:\"\n---\na: 1\n");
        assert_eq!(doc.lines().count(), 6);
        assert_eq!(doc.lines().next(), Some((1, "%V:2.0")));
    }

    #[test]
    fn test_preprocess_single_line() {
        let doc = accept(b"hello");
        assert_eq!(doc.lines().collect::<Vec<_>>(), [(1, "hello")]);
    }

    #[test]
    fn test_preprocess_empty_input() {
        let doc = accept(b"");
        assert_eq!(doc.lines().collect::<Vec<_>>(), [(1, "")]);
    }

    #[test]
    fn test_preprocess_only_newline() {
        let doc = accept(b"\n");
        assert_eq!(doc.lines().collect::<Vec<_>>(), [(1, ""), (2, "")]);
    }

    #[test]
    fn test_preprocess_multiple_newlines() {
        let doc = accept(b"\n\n\n");
        assert_eq!(doc.lines().count(), 4);
    }

    #[test]
    fn test_preprocess_line_numbers() {
        let numbers: Vec<usize> = accept(b"a\nb\nc\n")
            .lines()
            .map(|(number, _)| number)
            .take(3)
            .collect();
        assert_eq!(numbers, [1, 2, 3]);
    }

    // ---- line endings ----

    #[test]
    fn test_preprocess_crlf() {
        let doc = accept(b"%VERSION: 1.0\r\n---\r\n");
        assert_eq!(texts(&doc)[0], "%VERSION: 1.0");
    }

    #[test]
    fn test_preprocess_mixed_line_endings() {
        let doc = accept(b"line1\nline2\r\nline3\n");
        assert_eq!(texts(&doc)[..3], ["line1", "line2", "line3"]);
    }

    #[test]
    fn test_preprocess_bare_cr_error() {
        let err = reject(b"line1\rline2\n");
        assert!(err.message.contains("bare CR"));
    }

    #[test]
    fn test_preprocess_cr_at_end_error() {
        assert!(preprocess(b"line1\r", &default_limits()).is_err());
    }

    #[test]
    fn test_preprocess_crlf_only() {
        let doc = accept(b"\r\n");
        assert_eq!(doc.lines().count(), 2);
    }

    // ---- byte-order mark ----

    #[test]
    fn test_preprocess_bom_skip() {
        let doc = accept(b"\xEF\xBB\xBF%VERSION: 1.0\n---\n");
        assert_eq!(texts(&doc)[0], "%VERSION: 1.0");
    }

    #[test]
    fn test_preprocess_bom_only() {
        let doc = accept(b"\xEF\xBB\xBF");
        assert_eq!(texts(&doc), [""]);
    }

    #[test]
    fn test_preprocess_bom_with_content() {
        let doc = accept(b"\xEF\xBB\xBFhello\n");
        assert_eq!(texts(&doc)[0], "hello");
    }

    // ---- UTF-8 handling ----

    #[test]
    fn test_preprocess_valid_utf8() {
        let doc = accept("こんにちは\n".as_bytes());
        assert_eq!(texts(&doc)[0], "こんにちは");
    }

    #[test]
    fn test_preprocess_emoji() {
        let doc = accept("😀🎉🚀\n".as_bytes());
        assert_eq!(texts(&doc)[0], "😀🎉🚀");
    }

    #[test]
    fn test_preprocess_invalid_utf8_error() {
        let err = reject(b"\xFF\xFE");
        assert!(err.message.contains("UTF-8"));
    }

    #[test]
    fn test_preprocess_truncated_utf8_error() {
        assert!(preprocess(b"\xC0", &default_limits()).is_err());
    }

    // ---- control characters ----

    #[test]
    fn test_preprocess_tab_allowed() {
        let doc = accept(b"a\tb\tc\n");
        assert!(texts(&doc)[0].contains('\t'));
    }

    #[test]
    fn test_preprocess_null_char_error() {
        let err = reject(b"hello\x00world\n");
        assert!(err.message.contains("U+0000"));
    }

    #[test]
    fn test_preprocess_bell_char_error() {
        let err = reject(b"hello\x07world\n");
        assert!(err.message.contains("U+0007"));
    }

    #[test]
    fn test_preprocess_backspace_char_error() {
        let err = reject(b"hello\x08world\n");
        assert!(err.message.contains("U+0008"));
    }

    #[test]
    fn test_preprocess_escape_char_error() {
        let err = reject(b"hello\x1Bworld\n");
        assert!(err.message.contains("U+001B"));
    }

    #[test]
    fn test_preprocess_control_char_line_number() {
        let err = reject(b"line1\nline2\x00\n");
        assert_eq!(err.line, 2);
    }

    // ---- size limits ----

    #[test]
    fn test_preprocess_file_size_limit() {
        // Eleven bytes against a ten-byte cap.
        let err = preprocess(b"12345678901", &file_cap(10)).unwrap_err();
        assert!(err.message.contains("file too large"));
    }

    #[test]
    fn test_preprocess_file_size_at_limit() {
        // Exactly ten bytes against a ten-byte cap.
        assert!(preprocess(b"1234567890", &file_cap(10)).is_ok());
    }

    #[test]
    fn test_preprocess_line_length_limit() {
        // Six-byte line against a five-byte cap.
        let err = preprocess(b"123456\n", &line_cap(5)).unwrap_err();
        assert!(err.message.contains("line too long"));
    }

    #[test]
    fn test_preprocess_line_length_at_limit() {
        // Five-byte line against a five-byte cap.
        assert!(preprocess(b"12345\n", &line_cap(5)).is_ok());
    }

    #[test]
    fn test_preprocess_last_line_length_limit() {
        // The over-long line is the unterminated second one.
        let err = preprocess(b"abc\n123456", &line_cap(5)).unwrap_err();
        assert_eq!(err.line, 2);
    }

    // ---- is_blank_line ----

    #[test]
    fn test_is_blank_line() {
        for blank in ["", " ", "\t "] {
            assert!(is_blank_line(blank));
        }
        assert!(!is_blank_line("a"));
    }

    #[test]
    fn test_is_blank_line_with_tabs() {
        for blank in ["\t", "\t\t\t", " \t "] {
            assert!(is_blank_line(blank));
        }
    }

    #[test]
    fn test_is_blank_line_with_content() {
        for filled in ["x", " x ", "\tx"] {
            assert!(!is_blank_line(filled));
        }
    }

    #[test]
    fn test_is_blank_line_unicode() {
        assert!(is_blank_line(" "));
    }

    // ---- is_comment_line ----

    #[test]
    fn test_is_comment_line() {
        for comment in ["# comment", " # indented comment"] {
            assert!(is_comment_line(comment));
        }
        assert!(!is_comment_line("a: 1 # inline"));
    }

    #[test]
    fn test_is_comment_line_hash_only() {
        for comment in ["#", " #"] {
            assert!(is_comment_line(comment));
        }
    }

    #[test]
    fn test_is_comment_line_empty_comment() {
        for comment in ["# ", "#\t"] {
            assert!(is_comment_line(comment));
        }
    }

    #[test]
    fn test_is_comment_line_not_comment() {
        for other in ["", " ", "key: value", "key: #value"] {
            assert!(!is_comment_line(other));
        }
    }

    #[test]
    fn test_is_comment_line_with_tabs() {
        for comment in ["\t#comment", "\t\t# comment"] {
            assert!(is_comment_line(comment));
        }
    }

    // ---- PreprocessedInput ----

    #[test]
    fn test_preprocessed_input_lines_iterator() {
        let doc = accept(b"line1\nline2\nline3\n");
        assert_eq!(doc.lines().count(), 4);
    }

    #[test]
    fn test_preprocessed_input_debug() {
        let rendered = format!("{:?}", accept(b"test\n"));
        assert!(rendered.contains("PreprocessedInput"));
    }

    // ---- larger inputs and trailing-newline behaviour ----

    #[test]
    fn test_preprocess_very_long_line_ok() {
        let input = format!("{}\n", "x".repeat(1000));
        let doc = accept(input.as_bytes());
        assert_eq!(texts(&doc)[0].len(), 1000);
    }

    #[test]
    fn test_preprocess_many_lines() {
        // "line0" through "line99", LF-separated, with no trailing LF.
        let mut input = String::new();
        for i in 0..100 {
            if i > 0 {
                input.push('\n');
            }
            input.push_str(&format!("line{}", i));
        }
        let doc = accept(input.as_bytes());
        assert_eq!(doc.lines().count(), 100);
    }

    #[test]
    fn test_preprocess_trailing_newline_preserved() {
        let doc = accept(b"line\n");
        let found = texts(&doc);
        assert_eq!(found.len(), 2);
        assert_eq!(found[1], "");
    }

    #[test]
    fn test_preprocess_no_trailing_newline() {
        let doc = accept(b"line");
        assert_eq!(texts(&doc), ["line"]);
    }
}