1use crate::locale;
2use crate::simd_compare::SIMDCompare;
3use memmap2::Mmap;
4use std::cmp::Ordering;
5use std::fs::File;
6use std::io::{self, BufRead, BufReader};
7use std::path::Path;
8
9#[derive(Debug, Clone, Copy)]
11pub struct Line {
12 start: *const u8,
14 len: u32,
16}
17
18unsafe impl Send for Line {}
23unsafe impl Sync for Line {}
24
25impl Line {
26 pub fn new(data: &[u8]) -> Self {
28 Self {
29 start: data.as_ptr(),
30 len: data.len() as u32,
31 }
32 }
33
34 pub unsafe fn as_bytes(&self) -> &[u8] {
41 unsafe { std::slice::from_raw_parts(self.start, self.len as usize) }
44 }
45
46 pub fn extract_field(&self, field_num: usize, separator: Option<char>) -> Option<&[u8]> {
49 if field_num == 0 {
50 return None;
51 }
52
53 let bytes = unsafe { self.as_bytes() };
54
55 if separator.is_none() {
57 return self.extract_field_by_whitespace(bytes, field_num);
58 }
59
60 let sep_byte = separator.unwrap() as u8;
61 let mut field_count = 1;
62 let mut field_start = 0;
63
64 for (i, &byte) in bytes.iter().enumerate() {
65 if byte == sep_byte {
66 if field_count == field_num {
67 return Some(&bytes[field_start..i]);
68 }
69 field_count += 1;
70 field_start = i + 1;
71 }
72 }
73
74 if field_count == field_num && field_start < bytes.len() {
76 return Some(&bytes[field_start..]);
77 }
78
79 None
80 }
81
82 fn extract_field_by_whitespace<'a>(
85 &self,
86 bytes: &'a [u8],
87 field_num: usize,
88 ) -> Option<&'a [u8]> {
89 if field_num == 1 {
90 let mut field_start = 0;
93 for (i, &byte) in bytes.iter().enumerate() {
94 if byte != b' ' && byte != b'\t' {
95 field_start = i;
96 break;
97 }
98 }
99
100 for (i, &byte) in bytes[field_start..].iter().enumerate() {
102 if byte == b' ' || byte == b'\t' {
103 return Some(&bytes[field_start..field_start + i]);
104 }
105 }
106 return Some(&bytes[field_start..]); }
108
109 let mut field_boundaries = Vec::new();
112 let mut in_field = false;
113 let mut field_start = 0;
114
115 for (i, &byte) in bytes.iter().enumerate() {
116 let is_whitespace = byte == b' ' || byte == b'\t';
117
118 if !is_whitespace && !in_field {
119 field_start = i;
121 in_field = true;
122 } else if is_whitespace && in_field {
123 field_boundaries.push(field_start..i);
125 in_field = false;
126 }
127 }
128
129 if in_field {
131 field_boundaries.push(field_start..bytes.len());
132 }
133
134 if field_num > field_boundaries.len() {
135 return None;
136 }
137
138 let target_field = &field_boundaries[field_num - 1];
139
140 if field_num == 1 {
142 return Some(&bytes[target_field.clone()]);
143 }
144
145 let prev_field_end = if field_num > 1 {
148 field_boundaries[field_num - 2].end
149 } else {
150 0
151 };
152
153 Some(&bytes[prev_field_end..target_field.end])
155 }
156
157 pub fn extract_key(
159 &self,
160 key: &crate::config::SortKey,
161 separator: Option<char>,
162 ) -> Option<&[u8]> {
163 let start_field_data = self.extract_field(key.start_field, separator)?;
165
166 if key.end_field.is_none() {
168 if let Some(start_char) = key.start_char {
170 if start_char > 0 && start_char <= start_field_data.len() {
171 return Some(&start_field_data[start_char - 1..]);
172 }
173 }
174 return Some(start_field_data);
175 }
176
177 let bytes = unsafe { self.as_bytes() };
181
182 let start_pos = if let Some(field_data) = self.extract_field(key.start_field, separator) {
184 let offset = field_data.as_ptr() as usize - bytes.as_ptr() as usize;
185 if let Some(start_char) = key.start_char {
186 if start_char > 0 && start_char <= field_data.len() {
187 offset + start_char - 1
188 } else {
189 offset
190 }
191 } else {
192 offset
193 }
194 } else {
195 return None;
196 };
197
198 let end_pos = if let Some(end_field) = key.end_field {
200 if let Some(field_data) = self.extract_field(end_field, separator) {
201 let offset = field_data.as_ptr() as usize - bytes.as_ptr() as usize;
202 let field_end = offset + field_data.len();
203 if let Some(end_char) = key.end_char {
204 if end_char > 0 && end_char <= field_data.len() {
205 offset + end_char
206 } else {
207 field_end
208 }
209 } else {
210 field_end
211 }
212 } else {
213 bytes.len()
214 }
215 } else {
216 bytes.len()
217 };
218
219 if start_pos < end_pos && start_pos < bytes.len() {
220 Some(&bytes[start_pos..end_pos.min(bytes.len())])
221 } else {
222 None
223 }
224 }
225
226 pub fn parse_int(&self) -> Option<i64> {
228 let bytes = unsafe { self.as_bytes() };
231 if bytes.is_empty() {
232 return Some(0);
233 }
234
235 let mut start = 0;
236 let negative = if bytes[0] == b'-' {
237 start = 1;
238 true
239 } else {
240 false
241 };
242
243 if start >= bytes.len() {
244 return None;
245 }
246
247 let mut result: i64 = 0;
248 for &byte in &bytes[start..] {
249 if !byte.is_ascii_digit() {
250 return None;
251 }
252 result = result.checked_mul(10)?;
253 result = result.checked_add((byte - b'0') as i64)?;
254 }
255
256 Some(if negative { -result } else { result })
257 }
258
259 pub fn parse_general_numeric(&self) -> f64 {
261 let bytes = unsafe { self.as_bytes() };
262 if let Ok(s) = std::str::from_utf8(bytes) {
263 let trimmed = s.trim();
264
265 if trimmed.is_empty() {
267 return 0.0;
268 }
269
270 match trimmed.parse::<f64>() {
272 Ok(val) => val,
273 Err(_) => {
274 let lower = trimmed.to_lowercase();
276 if lower == "inf" || lower == "+inf" || lower == "infinity" {
277 f64::INFINITY
278 } else if lower == "-inf" || lower == "-infinity" {
279 f64::NEG_INFINITY
280 } else if lower == "nan" {
281 f64::NAN
282 } else {
283 f64::NEG_INFINITY
285 }
286 }
287 }
288 } else {
289 f64::NEG_INFINITY
290 }
291 }
292
293 pub fn compare_general_numeric(&self, other: &Line) -> Ordering {
295 let a = self.parse_general_numeric();
296 let b = other.parse_general_numeric();
297
298 match (a.is_nan(), b.is_nan()) {
300 (true, true) => unsafe { self.as_bytes().cmp(other.as_bytes()) }, (true, false) => Ordering::Greater,
302 (false, true) => Ordering::Less,
303 (false, false) => {
304 match a.total_cmp(&b) {
306 Ordering::Equal => {
307 unsafe { self.as_bytes().cmp(other.as_bytes()) }
310 }
311 other => other,
312 }
313 }
314 }
315 }
316
317 pub fn compare_with_keys(
319 &self,
320 other: &Line,
321 keys: &[crate::config::SortKey],
322 separator: Option<char>,
323 config: &crate::config::SortConfig,
324 ) -> Ordering {
325 if keys.is_empty() {
326 return self.compare_with_config(other, config);
328 }
329
330 for key in keys {
332 let self_field = self.extract_key(key, separator);
333 let other_field = other.extract_key(key, separator);
334
335 let cmp = match (self_field, other_field) {
336 (Some(a), Some(b)) => {
337 let a_line = Line::new(a);
339 let b_line = Line::new(b);
340
341 let result = if key.options.general_numeric {
343 a_line.compare_general_numeric(&b_line)
344 } else if key.options.numeric {
345 a_line.compare_numeric(&b_line)
346 } else if key.options.month {
347 a_line.compare_month(&b_line)
348 } else if key.options.version {
349 a_line.compare_version(&b_line)
350 } else if key.options.human_numeric {
351 a_line.compare_human_numeric(&b_line)
352 } else if key.options.dictionary_order && key.options.ignore_case {
353 a_line.compare_dictionary_order_ignore_case(&b_line)
354 } else if key.options.dictionary_order {
355 a_line.compare_dictionary_order(&b_line)
356 } else if key.options.ignore_case {
357 a_line.compare_ignore_case(&b_line)
358 } else if key.options.ignore_leading_blanks {
359 a_line.compare_lexicographic_with_blanks(&b_line, true)
360 } else {
361 a_line.compare_lexicographic(&b_line)
362 };
363
364 let final_result = if key.options.reverse {
366 result.reverse()
367 } else {
368 result
369 };
370
371 if config.debug {
373 let self_bytes = unsafe { self.as_bytes() };
374 let other_bytes = unsafe { other.as_bytes() };
375 let self_str = String::from_utf8_lossy(self_bytes);
376 let other_str = String::from_utf8_lossy(other_bytes);
377 let a_str = String::from_utf8_lossy(a);
378 let b_str = String::from_utf8_lossy(b);
379
380 let cmp_val = match final_result {
382 Ordering::Greater => 1,
383 Ordering::Less => -1,
384 Ordering::Equal => 0,
385 };
386
387 eprintln!("; k1=<{a_str}>; k2=<{b_str}>; s1=<{self_str}>, s2=<{other_str}>; cmp1={cmp_val}");
388 }
389
390 final_result
391 }
392 (None, Some(_)) => Ordering::Less,
393 (Some(_), None) => Ordering::Greater,
394 (None, None) => Ordering::Equal,
395 };
396
397 if cmp != Ordering::Equal {
398 return cmp;
399 }
400 }
401
402 if config.stable {
404 Ordering::Equal
405 } else {
406 self.compare_lexicographic(other)
408 }
409 }
410
411 pub fn compare_with_config(
413 &self,
414 other: &Line,
415 config: &crate::config::SortConfig,
416 ) -> Ordering {
417 let cmp = match config.mode {
418 crate::config::SortMode::GeneralNumeric => self.compare_general_numeric(other),
419 crate::config::SortMode::Numeric => self.compare_numeric(other),
420 crate::config::SortMode::Month => self.compare_month(other),
421 crate::config::SortMode::Version => self.compare_version(other),
422 crate::config::SortMode::HumanNumeric => self.compare_human_numeric(other),
423 crate::config::SortMode::Lexicographic => {
424 if config.dictionary_order && config.ignore_case {
425 self.compare_dictionary_order_ignore_case(other)
426 } else if config.dictionary_order {
427 self.compare_dictionary_order(other)
428 } else if config.ignore_case {
429 self.compare_ignore_case(other)
430 } else if config.ignore_leading_blanks {
431 self.compare_lexicographic_with_blanks(other, true)
432 } else {
433 self.compare_lexicographic(other)
434 }
435 }
436 _ => {
437 if config.dictionary_order {
439 self.compare_dictionary_order(other)
440 } else if config.ignore_leading_blanks {
441 self.compare_lexicographic_with_blanks(other, true)
442 } else {
443 self.compare_lexicographic(other)
444 }
445 }
446 };
447
448 if config.reverse {
449 cmp.reverse()
450 } else {
451 cmp
452 }
453 }
454
455 pub fn compare_numeric(&self, other: &Line) -> Ordering {
457 if let (Some(a), Some(b)) = (self.parse_int(), other.parse_int()) {
459 return a.cmp(&b);
460 }
461
462 self.compare_numeric_string_style(other)
464 }
465
466 fn compare_numeric_string_style(&self, other: &Line) -> Ordering {
468 let a_bytes = unsafe { self.as_bytes() };
469 let b_bytes = unsafe { other.as_bytes() };
470
471 let a_start = self.skip_leading_space(a_bytes);
473 let b_start = self.skip_leading_space(b_bytes);
474
475 if a_start >= a_bytes.len() && b_start >= b_bytes.len() {
476 return Ordering::Equal;
477 }
478 if a_start >= a_bytes.len() {
479 return Ordering::Less;
480 }
481 if b_start >= b_bytes.len() {
482 return Ordering::Greater;
483 }
484
485 let a_rest = &a_bytes[a_start..];
486 let b_rest = &b_bytes[b_start..];
487
488 let (a_negative, a_num_start) = self.parse_sign(a_rest);
490 let (b_negative, b_num_start) = self.parse_sign(b_rest);
491
492 match (a_negative, b_negative) {
493 (true, false) => Ordering::Less,
494 (false, true) => Ordering::Greater,
495 _ => {
496 let a_digits = &a_rest[a_num_start..];
498 let b_digits = &b_rest[b_num_start..];
499
500 let a_no_zeros = self.skip_leading_zeros(a_digits);
502 let b_no_zeros = self.skip_leading_zeros(b_digits);
503
504 let a_digit_count = self.count_leading_digits(&a_digits[a_no_zeros..]);
506 let b_digit_count = self.count_leading_digits(&b_digits[b_no_zeros..]);
507
508 let magnitude_cmp = match a_digit_count.cmp(&b_digit_count) {
509 Ordering::Equal => {
510 a_digits[a_no_zeros..a_no_zeros + a_digit_count]
512 .cmp(&b_digits[b_no_zeros..b_no_zeros + b_digit_count])
513 }
514 other => other,
515 };
516
517 if a_negative {
518 magnitude_cmp.reverse()
519 } else {
520 magnitude_cmp
521 }
522 }
523 }
524 }
525
526 fn skip_leading_space(&self, bytes: &[u8]) -> usize {
527 bytes
528 .iter()
529 .position(|&b| b != b' ' && b != b'\t')
530 .unwrap_or(bytes.len())
531 }
532
533 fn parse_sign(&self, bytes: &[u8]) -> (bool, usize) {
534 if bytes.is_empty() {
535 return (false, 0);
536 }
537 match bytes[0] {
538 b'-' => (true, 1),
539 b'+' => (false, 1),
540 _ => (false, 0),
541 }
542 }
543
544 fn skip_leading_zeros(&self, bytes: &[u8]) -> usize {
545 bytes.iter().position(|&b| b != b'0').unwrap_or(bytes.len())
546 }
547
548 fn count_leading_digits(&self, bytes: &[u8]) -> usize {
549 bytes.iter().take_while(|&&b| b.is_ascii_digit()).count()
550 }
551
552 #[allow(dead_code)]
554 fn compare_numeric_bytes(&self, other: &Line) -> Ordering {
555 let a_bytes = unsafe { self.as_bytes() };
556 let b_bytes = unsafe { other.as_bytes() };
557
558 let a_trimmed = a_bytes
560 .iter()
561 .skip_while(|&&b| b == b' ' || b == b'\t')
562 .collect::<Vec<_>>();
563 let b_trimmed = b_bytes
564 .iter()
565 .skip_while(|&&b| b == b' ' || b == b'\t')
566 .collect::<Vec<_>>();
567
568 if a_trimmed.is_empty() && b_trimmed.is_empty() {
569 return Ordering::Equal;
570 }
571 if a_trimmed.is_empty() {
572 return Ordering::Less;
573 }
574 if b_trimmed.is_empty() {
575 return Ordering::Greater;
576 }
577
578 let a_negative = *a_trimmed[0] == b'-';
580 let b_negative = *b_trimmed[0] == b'-';
581
582 match (a_negative, b_negative) {
583 (true, false) => Ordering::Less,
584 (false, true) => Ordering::Greater,
585 _ => {
586 let a_digits: Vec<u8> = a_trimmed
588 .iter()
589 .skip_while(|&&&b| b == b'-' || b == b'+')
590 .filter(|&&&b| b.is_ascii_digit())
591 .map(|&&b| b)
592 .collect();
593 let b_digits: Vec<u8> = b_trimmed
594 .iter()
595 .skip_while(|&&&b| b == b'-' || b == b'+')
596 .filter(|&&&b| b.is_ascii_digit())
597 .map(|&&b| b)
598 .collect();
599
600 let magnitude_cmp = match a_digits.len().cmp(&b_digits.len()) {
601 Ordering::Equal => a_digits.cmp(&b_digits),
602 other => other,
603 };
604
605 if a_negative {
606 magnitude_cmp.reverse()
607 } else {
608 magnitude_cmp
609 }
610 }
611 }
612 }
613
614 pub fn len(&self) -> usize {
616 self.len as usize
617 }
618
619 pub fn is_empty(&self) -> bool {
621 self.len == 0
622 }
623
624 pub fn compare_ignore_case(&self, other: &Line) -> Ordering {
626 let a_bytes = unsafe { self.as_bytes() };
627 let b_bytes = unsafe { other.as_bytes() };
628
629 if locale::LocaleConfig::is_enabled() {
631 locale::smart_compare(a_bytes, b_bytes, true)
632 } else {
633 SIMDCompare::compare_case_insensitive_simd(a_bytes, b_bytes)
635 }
636 }
637
638 pub fn compare_lexicographic(&self, other: &Line) -> Ordering {
640 let a_bytes = unsafe { self.as_bytes() };
641 let b_bytes = unsafe { other.as_bytes() };
642
643 if locale::LocaleConfig::is_enabled() {
645 locale::smart_compare(a_bytes, b_bytes, false)
646 } else {
647 SIMDCompare::compare_bytes_simd(a_bytes, b_bytes)
649 }
650 }
651
652 pub fn compare_lexicographic_with_blanks(
654 &self,
655 other: &Line,
656 ignore_leading_blanks: bool,
657 ) -> Ordering {
658 let mut a_bytes = unsafe { self.as_bytes() };
659 let mut b_bytes = unsafe { other.as_bytes() };
660
661 if ignore_leading_blanks {
662 let a_start = a_bytes
664 .iter()
665 .position(|&b| b != b' ' && b != b'\t')
666 .unwrap_or(a_bytes.len());
667 let b_start = b_bytes
668 .iter()
669 .position(|&b| b != b' ' && b != b'\t')
670 .unwrap_or(b_bytes.len());
671 a_bytes = &a_bytes[a_start..];
672 b_bytes = &b_bytes[b_start..];
673 }
674
675 if locale::LocaleConfig::is_enabled() {
677 locale::smart_compare(a_bytes, b_bytes, false)
678 } else {
679 SIMDCompare::compare_bytes_simd(a_bytes, b_bytes)
681 }
682 }
683
684 pub fn compare_dictionary_order(&self, other: &Line) -> Ordering {
686 let a_bytes = unsafe { self.as_bytes() };
687 let b_bytes = unsafe { other.as_bytes() };
688
689 let a_filtered = self.filter_dictionary_order(a_bytes);
690 let b_filtered = self.filter_dictionary_order(b_bytes);
691
692 if locale::LocaleConfig::is_enabled() {
694 locale::smart_compare(&a_filtered, &b_filtered, false)
695 } else {
696 SIMDCompare::compare_bytes_simd(&a_filtered, &b_filtered)
698 }
699 }
700
701 pub fn compare_dictionary_order_ignore_case(&self, other: &Line) -> Ordering {
703 let a_bytes = unsafe { self.as_bytes() };
704 let b_bytes = unsafe { other.as_bytes() };
705
706 let a_filtered = self.filter_dictionary_order(a_bytes);
707 let b_filtered = self.filter_dictionary_order(b_bytes);
708
709 if locale::LocaleConfig::is_enabled() {
711 locale::smart_compare(&a_filtered, &b_filtered, true)
712 } else {
713 SIMDCompare::compare_case_insensitive_simd(&a_filtered, &b_filtered)
715 }
716 }
717
718 fn filter_dictionary_order(&self, bytes: &[u8]) -> Vec<u8> {
721 if let Ok(s) = std::str::from_utf8(bytes) {
723 s.chars()
724 .filter(|c| c.is_alphanumeric() || *c == ' ' || *c == '\t')
725 .collect::<String>()
726 .into_bytes()
727 } else {
728 bytes
730 .iter()
731 .filter(|&&b| b.is_ascii_alphanumeric() || b == b' ' || b == b'\t')
732 .copied()
733 .collect()
734 }
735 }
736
737 pub fn compare_month(&self, other: &Line) -> Ordering {
739 let a_bytes = unsafe { self.as_bytes() };
740 let b_bytes = unsafe { other.as_bytes() };
741
742 fn month_value(bytes: &[u8]) -> u8 {
743 let upper_bytes: Vec<u8> = bytes.iter().map(|b| b.to_ascii_uppercase()).collect();
745
746 match upper_bytes.as_slice() {
748 b"JAN" | b"JANUARY" => 1,
749 b"FEB" | b"FEBRUARY" => 2,
750 b"MAR" | b"MARCH" => 3,
751 b"APR" | b"APRIL" => 4,
752 b"MAY" => 5,
753 b"JUN" | b"JUNE" => 6,
754 b"JUL" | b"JULY" => 7,
755 b"AUG" | b"AUGUST" => 8,
756 b"SEP" | b"SEPTEMBER" => 9,
757 b"OCT" | b"OCTOBER" => 10,
758 b"NOV" | b"NOVEMBER" => 11,
759 b"DEC" | b"DECEMBER" => 12,
760 _ => 0, }
762 }
763
764 let a_month = month_value(a_bytes);
765 let b_month = month_value(b_bytes);
766
767 match (a_month, b_month) {
768 (a, b) if a > 0 && b > 0 => a.cmp(&b),
770 (a, 0) if a > 0 => Ordering::Greater,
772 (0, b) if b > 0 => Ordering::Less,
774 (0, 0) => self.compare_lexicographic(other),
776 _ => self.compare_lexicographic(other),
778 }
779 }
780
781 pub fn compare_version(&self, other: &Line) -> Ordering {
783 let a_bytes = unsafe { self.as_bytes() };
784 let b_bytes = unsafe { other.as_bytes() };
785
786 let a_str = String::from_utf8_lossy(a_bytes);
788 let b_str = String::from_utf8_lossy(b_bytes);
789
790 Self::compare_version_strings(&a_str, &b_str)
791 }
792
793 fn compare_version_strings(a: &str, b: &str) -> Ordering {
795 let a_parts = Self::version_tokenize(a);
797 let b_parts = Self::version_tokenize(b);
798
799 for (a_part, b_part) in a_parts.iter().zip(b_parts.iter()) {
800 match Self::compare_version_component(a_part, b_part) {
801 Ordering::Equal => continue,
802 other => return other,
803 }
804 }
805
806 a_parts.len().cmp(&b_parts.len())
808 }
809
810 fn version_tokenize(s: &str) -> Vec<String> {
812 let mut tokens = Vec::new();
813 let mut current = String::new();
814 let mut in_alpha = false;
815
816 for ch in s.chars() {
817 let is_alpha = ch.is_alphabetic();
818 let is_digit = ch.is_ascii_digit();
819
820 if is_alpha || is_digit {
821 if in_alpha != is_alpha && !current.is_empty() {
822 tokens.push(current);
823 current = String::new();
824 }
825 current.push(ch);
826 in_alpha = is_alpha;
827 } else if !current.is_empty() {
828 tokens.push(current);
829 current = String::new();
830 }
831 }
832
833 if !current.is_empty() {
834 tokens.push(current);
835 }
836
837 tokens
838 }
839
840 fn compare_version_component(a: &str, b: &str) -> Ordering {
842 if let (Ok(a_num), Ok(b_num)) = (a.parse::<u64>(), b.parse::<u64>()) {
844 return a_num.cmp(&b_num);
845 }
846
847 match (
849 a.chars().all(|c| c.is_ascii_digit()),
850 b.chars().all(|c| c.is_ascii_digit()),
851 ) {
852 (true, false) => Ordering::Less,
853 (false, true) => Ordering::Greater,
854 _ => a.cmp(b), }
856 }
857
858 pub fn compare_human_numeric(&self, other: &Line) -> Ordering {
860 let a_bytes = unsafe { self.as_bytes() };
861 let b_bytes = unsafe { other.as_bytes() };
862
863 let a_string = String::from_utf8_lossy(a_bytes);
864 let b_string = String::from_utf8_lossy(b_bytes);
865 let a_str = a_string.trim();
866 let b_str = b_string.trim();
867
868 let a_val = Self::parse_human_numeric(a_str);
869 let b_val = Self::parse_human_numeric(b_str);
870
871 match (a_val, b_val) {
872 (Some(a), Some(b)) => {
873 match a.partial_cmp(&b) {
874 Some(ord) => ord,
875 None => a_str.cmp(b_str), }
877 }
878 (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, (None, None) => a_str.cmp(b_str), }
882 }
883
884 fn parse_human_numeric(s: &str) -> Option<f64> {
886 if s.is_empty() {
887 return None;
888 }
889
890 let s = s.trim();
891 let last_char = s.chars().last()?;
892
893 let multiplier = match last_char.to_ascii_uppercase() {
894 'K' => 1024.0,
895 'M' => 1024.0 * 1024.0,
896 'G' => 1024.0 * 1024.0 * 1024.0,
897 'T' => 1024.0 * 1024.0 * 1024.0 * 1024.0,
898 'P' => 1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024.0,
899 _ => {
900 return s.parse::<f64>().ok();
902 }
903 };
904
905 let numeric_part = s[..s.len() - 1].trim();
907 let value = numeric_part.parse::<f64>().ok()?;
908
909 Some(value * multiplier)
910 }
911}
912
913pub struct MappedFile {
915 _mmap: Mmap, lines: Vec<Line>,
917}
918
919impl MappedFile {
920 pub fn new(path: &Path) -> io::Result<Self> {
922 let file = File::open(path)?;
923 let mmap = unsafe { Mmap::map(&file)? };
924
925 let lines = parse_lines(&mmap);
927
928 Ok(Self { _mmap: mmap, lines })
929 }
930
931 pub fn lines(&self) -> &[Line] {
933 &self.lines
934 }
935}
936
937fn parse_lines(data: &[u8]) -> Vec<Line> {
939 let mut lines = Vec::new();
940 let mut start = 0;
941
942 for (i, &byte) in data.iter().enumerate() {
943 if byte == b'\n' {
944 let end = if i > 0 && data[i - 1] == b'\r' {
946 i - 1
947 } else {
948 i
949 };
950 let line_data = &data[start..end];
951 lines.push(Line::new(line_data));
952 start = i + 1;
953 }
954 }
955
956 if start < data.len() {
958 let mut end = data.len();
959 if end > start && data[end - 1] == b'\r' {
961 end -= 1;
962 }
963 let line_data = &data[start..end];
964 lines.push(Line::new(line_data));
965 }
966
967 lines
968}
969
970pub struct ZeroCopyReader {
972 reader: BufReader<File>,
973 buffer: Vec<u8>,
974 lines: Vec<Line>,
975}
976
977impl ZeroCopyReader {
978 pub fn new(file: File) -> Self {
979 Self {
980 reader: BufReader::new(file),
981 buffer: Vec::with_capacity(64 * 1024), lines: Vec::new(),
983 }
984 }
985
986 pub fn read_chunk(&mut self) -> io::Result<&[Line]> {
988 self.buffer.clear();
989 self.lines.clear();
990
991 let mut total_read = 0;
992 const CHUNK_SIZE: usize = 64 * 1024;
993
994 while total_read < CHUNK_SIZE {
996 let mut line_buf = Vec::new();
997 let bytes_read = self.reader.read_until(b'\n', &mut line_buf)?;
998
999 if bytes_read == 0 {
1000 break; }
1002
1003 let start_idx = self.buffer.len();
1004 self.buffer.extend_from_slice(&line_buf);
1005
1006 let end_idx = if line_buf.ends_with(b"\n") {
1008 self.buffer.len() - 1
1009 } else {
1010 self.buffer.len()
1011 };
1012
1013 let line_data = &self.buffer[start_idx..end_idx];
1014 self.lines.push(Line::new(line_data));
1015
1016 total_read += bytes_read;
1017 }
1018
1019 Ok(&self.lines)
1020 }
1021}
1022
1023pub fn compare_numeric_lines(a: &Line, b: &Line) -> Ordering {
1025 unsafe {
1026 let a_bytes = a.as_bytes();
1027 let b_bytes = b.as_bytes();
1028
1029 if let (Some(a_num), Some(b_num)) = (parse_int(a_bytes), parse_int(b_bytes)) {
1031 return a_num.cmp(&b_num);
1032 }
1033
1034 compare_numeric_bytes(a_bytes, b_bytes)
1036 }
1037}
1038
1039fn parse_int(bytes: &[u8]) -> Option<i64> {
1041 if bytes.is_empty() {
1042 return Some(0);
1043 }
1044
1045 let mut result: i64 = 0;
1046 let mut negative = false;
1047 let mut start = 0;
1048
1049 if bytes[0] == b'-' {
1051 negative = true;
1052 start = 1;
1053 } else if bytes[0] == b'+' {
1054 start = 1;
1055 }
1056
1057 for &byte in &bytes[start..] {
1059 if !byte.is_ascii_digit() {
1060 return None; }
1062
1063 result = result.checked_mul(10)?;
1064 result = result.checked_add((byte - b'0') as i64)?;
1065 }
1066
1067 if negative {
1068 result = -result;
1069 }
1070
1071 Some(result)
1072}
1073
1074fn compare_numeric_bytes(a: &[u8], b: &[u8]) -> Ordering {
1076 let a = skip_whitespace(a);
1078 let b = skip_whitespace(b);
1079
1080 match (a.is_empty(), b.is_empty()) {
1082 (true, true) => return Ordering::Equal,
1083 (true, false) => return Ordering::Less,
1084 (false, true) => return Ordering::Greater,
1085 (false, false) => {
1086 }
1088 }
1089
1090 let (a_negative, a_digits) = extract_sign(a);
1092 let (b_negative, b_digits) = extract_sign(b);
1093
1094 match (a_negative, b_negative) {
1096 (false, true) => return Ordering::Greater,
1097 (true, false) => return Ordering::Less,
1098 _ => {}
1099 }
1100
1101 let magnitude_cmp = compare_magnitude(a_digits, b_digits);
1103
1104 if a_negative {
1105 magnitude_cmp.reverse()
1106 } else {
1107 magnitude_cmp
1108 }
1109}
1110
1111fn skip_whitespace(bytes: &[u8]) -> &[u8] {
1112 let start = bytes
1113 .iter()
1114 .position(|&b| !b.is_ascii_whitespace())
1115 .unwrap_or(bytes.len());
1116 &bytes[start..]
1117}
1118
1119fn extract_sign(bytes: &[u8]) -> (bool, &[u8]) {
1120 if bytes.starts_with(b"-") {
1121 (true, &bytes[1..])
1122 } else if bytes.starts_with(b"+") {
1123 (false, &bytes[1..])
1124 } else {
1125 (false, bytes)
1126 }
1127}
1128
1129fn compare_magnitude(a: &[u8], b: &[u8]) -> Ordering {
1130 let a_dot = a.iter().position(|&b| b == b'.');
1132 let b_dot = b.iter().position(|&b| b == b'.');
1133
1134 let (a_int, a_frac) = match a_dot {
1135 Some(pos) => (&a[..pos], &a[pos + 1..]),
1136 None => (a, &[][..]),
1137 };
1138
1139 let (b_int, b_frac) = match b_dot {
1140 Some(pos) => (&b[..pos], &b[pos + 1..]),
1141 None => (b, &[][..]),
1142 };
1143
1144 let int_cmp = compare_integer_parts(a_int, b_int);
1146 if int_cmp != Ordering::Equal {
1147 return int_cmp;
1148 }
1149
1150 compare_fractional_parts(a_frac, b_frac)
1152}
1153
1154fn compare_integer_parts(a: &[u8], b: &[u8]) -> Ordering {
1155 let a = skip_leading_zeros(a);
1157 let b = skip_leading_zeros(b);
1158
1159 let len_cmp = a.len().cmp(&b.len());
1161 if len_cmp != Ordering::Equal {
1162 return len_cmp;
1163 }
1164
1165 a.cmp(b)
1167}
1168
1169fn compare_fractional_parts(a: &[u8], b: &[u8]) -> Ordering {
1170 let max_len = a.len().max(b.len());
1171
1172 for i in 0..max_len {
1173 let a_digit = a.get(i).copied().unwrap_or(b'0');
1174 let b_digit = b.get(i).copied().unwrap_or(b'0');
1175
1176 let cmp = a_digit.cmp(&b_digit);
1177 if cmp != Ordering::Equal {
1178 return cmp;
1179 }
1180 }
1181
1182 Ordering::Equal
1183}
1184
1185fn skip_leading_zeros(bytes: &[u8]) -> &[u8] {
1186 let start = bytes.iter().position(|&b| b != b'0').unwrap_or(bytes.len());
1187 if start == bytes.len() {
1188 b"0" } else {
1190 &bytes[start..]
1191 }
1192}
1193
1194pub fn compare_case_insensitive(a: &[u8], b: &[u8]) -> Ordering {
1196 if locale::LocaleConfig::is_enabled() {
1198 locale::smart_compare(a, b, true)
1199 } else {
1200 let min_len = a.len().min(b.len());
1202
1203 for i in 0..min_len {
1204 let a_char = a[i].to_ascii_lowercase();
1205 let b_char = b[i].to_ascii_lowercase();
1206
1207 match a_char.cmp(&b_char) {
1208 Ordering::Equal => continue,
1209 other => return other,
1210 }
1211 }
1212
1213 a.len().cmp(&b.len())
1214 }
1215}
1216
1217#[cfg(test)]
1218mod tests {
1219 use super::*;
1220
1221 #[test]
1222 fn test_simple_line_creation() {
1223 let data = b"hello world";
1224 let line = Line::new(data);
1225
1226 unsafe {
1227 assert_eq!(line.as_bytes(), b"hello world");
1228 }
1229 assert_eq!(line.len(), 11);
1230 }
1231
1232 #[test]
1233 fn test_numeric_comparison() {
1234 let a = Line::new(b"123");
1235 let b = Line::new(b"456");
1236 let c = Line::new(b"123");
1237
1238 assert_eq!(compare_numeric_lines(&a, &b), Ordering::Less);
1239 assert_eq!(compare_numeric_lines(&b, &a), Ordering::Greater);
1240 assert_eq!(compare_numeric_lines(&a, &c), Ordering::Equal);
1241 }
1242
1243 #[test]
1244 fn test_simple_int_parsing() {
1245 assert_eq!(parse_int(b"123"), Some(123));
1246 assert_eq!(parse_int(b"-456"), Some(-456));
1247 assert_eq!(parse_int(b"+789"), Some(789));
1248 assert_eq!(parse_int(b"0"), Some(0));
1249 assert_eq!(parse_int(b""), Some(0));
1250 assert_eq!(parse_int(b"12.34"), None); assert_eq!(parse_int(b"abc"), None); }
1253
1254 #[test]
1255 fn test_parse_lines_with_different_endings() {
1256 let unix_data = b"line1\nline2\nline3";
1258 let unix_lines = parse_lines(unix_data);
1259 assert_eq!(unix_lines.len(), 3);
1260 unsafe {
1261 assert_eq!(unix_lines[0].as_bytes(), b"line1");
1262 assert_eq!(unix_lines[1].as_bytes(), b"line2");
1263 assert_eq!(unix_lines[2].as_bytes(), b"line3");
1264 }
1265
1266 let windows_data = b"line1\r\nline2\r\nline3\r\n";
1268 let windows_lines = parse_lines(windows_data);
1269 assert_eq!(windows_lines.len(), 3);
1270 unsafe {
1271 assert_eq!(windows_lines[0].as_bytes(), b"line1");
1272 assert_eq!(windows_lines[1].as_bytes(), b"line2");
1273 assert_eq!(windows_lines[2].as_bytes(), b"line3");
1274 }
1275
1276 let mixed_data = b"line1\r\nline2\nline3\r";
1278 let mixed_lines = parse_lines(mixed_data);
1279 assert_eq!(mixed_lines.len(), 3);
1280 unsafe {
1281 assert_eq!(mixed_lines[0].as_bytes(), b"line1");
1282 assert_eq!(mixed_lines[1].as_bytes(), b"line2");
1283 assert_eq!(mixed_lines[2].as_bytes(), b"line3");
1284 }
1285
1286 let single_data = b"single_line";
1288 let single_lines = parse_lines(single_data);
1289 assert_eq!(single_lines.len(), 1);
1290 unsafe {
1291 assert_eq!(single_lines[0].as_bytes(), b"single_line");
1292 }
1293 }
1294}