Skip to main content

provenant/utils/
text.rs

1use std::path::Path;
2
/// The byte-order-mark code point, U+FEFF.
///
/// As a Rust `char` this is a single code point; encoded as UTF-8 it is the
/// three bytes `EF BB BF`.
const UTF8_BOM_CHAR: char = '\u{FEFF}';
4
/// File extensions that [`is_source`] treats as source code.
///
/// Every entry is lowercase and starts with a dot; `is_source` relies on both
/// properties when comparing.
const SOURCE_EXTENSIONS: &[&str] = &[
    ".ada", ".adb", ".asm", ".asp", ".aj", ".bas", ".bat", ".c", ".c++", ".cc", ".clj", ".cob",
    ".cpp", ".cs", ".csh", ".csx", ".cxx", ".d", ".e", ".el", ".f", ".fs", ".f77", ".f90", ".for",
    ".fth", ".ftn", ".go", ".h", ".hh", ".hpp", ".hs", ".html", ".htm", ".hxx", ".java", ".js",
    ".jsx", ".jsp", ".ksh", ".kt", ".lisp", ".lua", ".m", ".m4", ".nim", ".pas", ".php", ".pl",
    ".pp", ".ps1", ".py", ".r", ".rb", ".ruby", ".rs", ".s", ".scala", ".sh", ".swift", ".ts",
    ".vhdl", ".verilog", ".vb", ".groovy", ".po",
];

/// Returns `true` when `path` has an extension listed in [`SOURCE_EXTENSIONS`].
///
/// The comparison is case-insensitive: the extension is lowercased before the
/// lookup, so `TEST.RS` and `test.rs` are treated alike. Paths without an
/// extension (for example `Makefile`) yield `false`. Only the final extension
/// is considered, as reported by [`Path::extension`].
pub fn is_source(path: &Path) -> bool {
    match path.extension() {
        Some(ext) => {
            // Lowercase once, then compare against each table entry with its
            // leading dot removed; this avoids allocating a second ".ext"
            // string just to perform the lookup.
            let lowered = ext.to_string_lossy().to_lowercase();
            SOURCE_EXTENSIONS
                .iter()
                .any(|known| known.strip_prefix('.') == Some(lowered.as_str()))
        }
        None => false,
    }
}
23
/// Replaces verbatim (textual) escape sequences with a single space.
///
/// The two-character sequences backslash + `r`, backslash + `n` and
/// backslash + `t` — as they appear when escape sequences are written out
/// literally, e.g. inside a C string literal — are each turned into one space.
/// Real control characters (an actual newline, tab or carriage return) and
/// any other backslash sequence are left untouched.
pub fn remove_verbatim_escape_sequences(s: &str) -> String {
    // The output is never longer than the input, so one allocation suffices.
    let mut cleaned = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(current) = chars.next() {
        let starts_escape =
            current == '\\' && matches!(chars.peek(), Some(&('r' | 'n' | 't')));
        if starts_escape {
            // Consume the letter after the backslash and emit one space for
            // the whole pair.
            chars.next();
            cleaned.push(' ');
        } else {
            cleaned.push(current);
        }
    }

    cleaned
}
29
30pub fn strip_utf8_bom_str(s: &str) -> &str {
31    s.strip_prefix(UTF8_BOM_CHAR).unwrap_or(s)
32}
33
/// Unit tests for the text helpers in this module.
#[cfg(test)]
mod tests {
    // `Path` is brought into scope through the parent module's own import.
    use super::*;

    // ---- strip_utf8_bom_str ------------------------------------------------

    #[test]
    fn test_strip_utf8_bom_str_with_bom() {
        let s = "\u{FEFF}Hello World";
        assert_eq!(strip_utf8_bom_str(s), "Hello World");
    }

    #[test]
    fn test_strip_utf8_bom_str_without_bom() {
        let s = "Hello World";
        assert_eq!(strip_utf8_bom_str(s), "Hello World");
    }

    #[test]
    fn test_strip_utf8_bom_str_empty() {
        let s = "";
        assert_eq!(strip_utf8_bom_str(s), "");
    }

    #[test]
    fn test_strip_utf8_bom_str_only_bom() {
        let s = "\u{FEFF}";
        assert_eq!(strip_utf8_bom_str(s), "");
    }

    #[test]
    fn test_bom_character_is_not_whitespace() {
        // `str::trim` does not remove U+FEFF, which is why a dedicated
        // helper is needed.
        let s = "\u{FEFF}Hello";
        assert_ne!(s.trim(), "Hello");
        assert_eq!(strip_utf8_bom_str(s), "Hello");
    }

    // ---- is_source ---------------------------------------------------------

    #[test]
    fn test_is_source_rust() {
        assert!(is_source(Path::new("test.rs")));
        assert!(is_source(Path::new("TEST.RS")));
    }

    #[test]
    fn test_is_source_python() {
        assert!(is_source(Path::new("script.py")));
    }

    #[test]
    fn test_is_source_javascript() {
        assert!(is_source(Path::new("app.js")));
    }

    #[test]
    fn test_is_source_c() {
        assert!(is_source(Path::new("options.c")));
        assert!(is_source(Path::new("OPTIONS.C")));
    }

    #[test]
    fn test_is_source_not_source() {
        assert!(!is_source(Path::new("README.md")));
        assert!(!is_source(Path::new("data.json")));
        assert!(!is_source(Path::new("config.yaml")));
    }

    #[test]
    fn test_is_source_no_extension() {
        assert!(!is_source(Path::new("Makefile")));
    }

    #[test]
    fn test_is_source_options_c() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/regression/options.c");
        assert!(
            is_source(path),
            "options.c should be recognized as source file"
        );
    }

    // ---- remove_verbatim_escape_sequences ----------------------------------

    #[test]
    fn test_remove_verbatim_escape_sequences_basic() {
        let input = "line1\\nline2\\rline3\\tline4";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, "line1 line2 line3 line4");
    }

    #[test]
    fn test_remove_verbatim_escape_sequences_only_backslash_n() {
        let input = "hello\\nworld";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, "hello world");
    }

    #[test]
    fn test_remove_verbatim_escape_sequences_no_escapes() {
        let input = "normal text without escapes";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, input);
    }

    #[test]
    fn test_remove_verbatim_escape_sequences_actual_newline() {
        // A real newline character is not a verbatim escape and must survive.
        let input = "line1\nline2";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, "line1\nline2");
    }

    #[test]
    fn test_remove_verbatim_escape_sequences_multiple() {
        let input = "a\\nb\\nc\\n";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, "a b c ");
    }

    #[test]
    fn test_remove_verbatim_escape_sequences_options_c_sample() {
        let input = "Try `progname --help' for more information.\\n";
        let output = remove_verbatim_escape_sequences(input);
        assert_eq!(output, "Try `progname --help' for more information. ");
    }
}