htmlprep/
lib.rs

1use std::{io, error, fmt, fs, str::{self, Utf8Error}, path::{PathBuf, Path}, sync::OnceLock, collections::HashMap}; 
2use regex::bytes::{Regex, RegexBuilder, Captures};
3
4static RE_INCLUDE: OnceLock<Regex> = OnceLock::new();
5const INCLUDE_REGEX: &str = r##"<!--\s*?#include\s+"([^"]+)"\s*?-->"##;
6static RE_PLACEHOLDER: OnceLock<Regex> = OnceLock::new();
7const PLACEHOLDER_REGEX: &str = r##"<!--\s*?#placeholder\s+"([^"]+)"\s*?-->"##;
8
/// Error returned by the fallible operations in this file.
///
/// It pairs a coarse category (`kind`) with the optional lower-level error
/// that triggered it. The underlying error is reachable through
/// [`std::error::Error::source`].
#[derive(Debug)]
pub struct Error {
    /// Coarse category of the failure.
    kind: ErrorType,
    /// Lower-level error that caused this one, if any.
    source: Option<Box<dyn error::Error>>,
}

/// Categories of failure reported through [`Error`].
#[derive(Debug)]
enum ErrorType {
    /// A path that has to be a directory exists but is not one.
    DirExists,
    /// An I/O operation failed; the `io::Error` is stored as the source.
    IO,
    /// Bytes that had to be UTF-8 were not; the `Utf8Error` is the source.
    Utf8Parse,
    /// A directory entry was neither a regular file nor a directory.
    SimLinkFound,
    /// A path had no final file-name component.
    NoFilename,
}

impl fmt::Display for Error {
    /// Writes the error category and, when one is recorded, the cause.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Format the cause with `Display` and leave it out entirely when
        // absent, rather than debug-printing the `Option` wrapper
        // ("caused by: None").
        match &self.source {
            Some(cause) => write!(f, "Error type: {:?} caused by: {}", self.kind, cause),
            None => write!(f, "Error type: {:?}", self.kind),
        }
    }
}

impl error::Error for Error {
    /// Exposes the wrapped lower-level error so callers can walk the chain.
    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
        self.source.as_deref()
    }
}

impl From<io::Error> for Error {
    /// Wraps an I/O error, keeping it as the source.
    fn from(error: io::Error) -> Self {
        Error {
            kind: ErrorType::IO,
            source: Some(Box::new(error)),
        }
    }
}

impl From<Utf8Error> for Error {
    /// Wraps a UTF-8 decoding error, keeping it as the source.
    fn from(error: Utf8Error) -> Self {
        Error {
            kind: ErrorType::Utf8Parse,
            source: Some(Box::new(error)),
        }
    }
}
43
/// Owned buffer holding the bytes of an HTML document.
#[derive(Debug)]
pub struct Html {
    bytes: Vec<u8>,
}

impl From<Html> for Vec<u8> {
    /// Unwraps the document into its byte buffer without copying.
    fn from(Html { bytes }: Html) -> Self {
        bytes
    }
}

impl From<Vec<u8>> for Html {
    /// Takes ownership of `bytes` as the document contents.
    fn from(bytes: Vec<u8>) -> Self {
        Self { bytes }
    }
}
60
/// Settings that control a compile run.
pub struct CompileOptions<'a> {
    /// Maps the extension of a source file that must be processed to the
    /// extension given to the file written into `dest`.
    pub process_extensions: HashMap<String, String>,
    /// Directory entries whose extension appears here are skipped. The empty
    /// string stands for "no extension".
    pub skip_extensions: Vec<String>,
    /// File or directory to read from.
    pub source: &'a Path,
    /// Directory the results are written into.
    pub dest: &'a Path,
    /// Directory against which include paths starting with `/` are resolved.
    pub root: &'a Path,
}

impl Default for CompileOptions<'_> {
    /// Processes `html` and `htmlraw` files into `html`, skips `htmlsnippet`,
    /// `htmlprep` and extension-less entries, reads from `.`, writes to
    /// `processed_html` and uses `/` as the website root.
    fn default() -> Self {
        let process_extensions = [("html", "html"), ("htmlraw", "html")]
            .iter()
            .map(|&(from, to)| (from.to_owned(), to.to_owned()))
            .collect();
        let skip_extensions = ["htmlsnippet", "htmlprep", ""]
            .iter()
            .map(|&extension| extension.to_owned())
            .collect();

        Self {
            process_extensions,
            skip_extensions,
            source: Path::new("."),
            dest: Path::new("processed_html"),
            root: Path::new("/"),
        }
    }
}
87
88impl CompileOptions<'_> {
89    pub fn compile(&self) -> Result<(), Error> 
90    {
91        // Do not allow dest dir to overwrite an existing file
92        match fs::metadata(self.dest) {
93            Err(error) => {
94                if error.kind().eq(&io::ErrorKind::NotFound) {
95                    fs::create_dir_all(self.dest)?;
96                }
97            },
98            Ok(dest_metadata) => {
99                if !dest_metadata.is_dir() {
100                    return Err(Error{kind: ErrorType::DirExists, source: None});
101                }
102            }
103        }
104        // Ensure that the root dir is a dir and actually exists
105        {
106            let root_metadata = fs::metadata(self.root)?;
107            if !root_metadata.is_dir() {
108                return Err(Error{kind: ErrorType::DirExists, source: None});
109            }
110        }
111        // All checkable options are valid. Begin processing.
112        let source_metadata = fs::metadata(self.source)?;
113        if source_metadata.is_dir() {
114            self.compile_dir(self.source, self.dest)?;
115        } else if source_metadata.is_file() {
116            let processed = process_file(self.source, self.root)?;
117            fs::write(
118                self.dest.join(self.source.file_name().unwrap()),
119                &processed
120            )?;
121        }
122        Ok(())
123    }
124
125    /// Recursively tries to process every file in a directory. Intentionally
126    /// left private.
127    fn compile_dir(&self, source: &Path, dest: &Path) -> Result<(), Error> {
128        let dir_entries = source.read_dir()?;
129        for entry in dir_entries {
130            let file = entry?;
131            let file_type = file.file_type()?;
132            let file_path = file.path();
133            let file_extension = match file_path.extension() {
134                Some(extension) => extension.to_str().unwrap().to_owned(),
135                None => String::from(""),
136            };
137            if self.skip_extensions.contains(&file_extension) {
138                continue;
139            }
140            let dest = dest.join(&file.file_name());
141            if file_type.is_dir() {
142                self.compile_dir(&file_path, &dest)?; 
143            } else if file_type.is_file() {
144                match self.process_extensions.get(&file_extension) {
145                    None => {
146                        fs::copy(&file_path, &dest)?;
147                    },
148                    Some(extension) => {
149                        let processed = process_file(&file.path(), self.root)?;
150                        fs::write(&dest.with_extension(extension), &processed)?;
151                    },
152                }
153            } else {
154                return Err(Error{ kind: ErrorType::SimLinkFound, source: None });
155            }
156        }
157        Ok(())
158    } 
159
160}
161
162// TODO ASAP:
163// - Better tests
164
165// TODO would be nice:
166// - It would be nice if there was a cmdline util
167// - It would be even nicer if you could specify a dir on a remote server to ssh
168//   into and copy all files over to in one fell swoop
// - It would be nice if you could run it PHP style, with a dynamic server that
//   processes each file as it is requested. This mode should only be used for
//   testing, because the MAIN goal of this project is to make generating
//   static content easier
173
174/// Processes a single file and returns a structure containing the entire file
175/// in memory
176pub fn process_file(file: &Path, webroot: &Path) -> Result<Vec<u8>, Error> {
177    if file.file_name().is_none() {
178        return Err(Error {kind: ErrorType::NoFilename, source: None });
179    }
180
181    let raw_html = fs::read(file)?;
182    return Ok(Html::process(raw_html.as_slice(), webroot, unsafe{file.parent().unwrap_unchecked()} 
183        /* a file with a name is guaranteed to also have a parent */
184    )?.into());
185}
186
187impl Html {
188    /// Pre-processes a slice of bytes.
189    pub fn process(html: &[u8], website_root: &Path, cwd: &Path) -> Result<Self, Error> {
190        let re_include = RE_INCLUDE.get_or_init(|| RegexBuilder::new(INCLUDE_REGEX)
191            .dot_matches_new_line(true)
192            .build()
193            .unwrap()
194        );
195        let mut processed_html = html.to_vec();
196        let include_captures: Vec<Captures>  = re_include.captures_iter(&html).collect();
197        for capture in include_captures.iter().rev() {
198            let comment = unsafe{ capture.get(0).unwrap_unchecked() };
199            let comment_path = unsafe{ capture.get(1).unwrap_unchecked() };
200            let include_path = make_path_absolute(&comment_path.as_bytes(), website_root, cwd)?;
201            let include_contents = fs::read(include_path)?;
202            let comment_range = comment.start()..comment.end();
203            processed_html.splice(comment_range, include_contents);
204        }
205    
206        return Ok(processed_html.into());
207    }
208
209    /// Returns all `placeholders` in the file. A `placeholder` is a range in
210    /// the file which is meant to be replaced server-side each time the file is
211    /// requested. A `placeholder` is defined in an html file using a
212    /// `#placeholder` comment. This allows for arbitrary insertion of HTML at
213    /// runtime. The order of replacement does not matter.
214    /// 
215    /// # Examples
216    /// 
217    /// ```
218    /// use std::path::Path;
219    /// use std::str;
220    /// use htmlprep::*;
221    /// 
222    /// fn main() -> Result<(), Box<dyn std::error::Error>> {
223    ///     let raw_html = r##"
224    ///             <!DOCTYPE html><html><body>
225    ///                 <!-- #placeholder "name" -->
226    ///                 <!-- #placeholder "visitor-number" -->
227    ///             </body></html>"##.as_bytes();
228    ///     
229    ///     let mut html: Html = raw_html.to_vec().into();
230    ///     let mut placeholders = html.get_placeholders()?;
231    ///     assert!(placeholders.contains("name"));
232    ///     assert!(placeholders.contains("visitor-number"));
233    ///     
234    ///     let name = "Alice";
235    ///     let name_replacement = format!("<p>Welcome to the site, <b>{name}!</b></p>");
236    ///     html.replace_placeholder(&mut placeholders, "name", name_replacement.as_bytes());
237    ///     let visitor_number = 1234;
238    ///     let visitor_num_replacement = format!("<p>You are visitor number: {visitor_number}</p>");
239    ///     html.replace_placeholder(&mut placeholders, "visitor-number", visitor_num_replacement.as_bytes());
240    ///     // Calling this function again is a no-op
241    ///     html.replace_placeholder(&mut placeholders, "visitor-number", visitor_num_replacement.as_bytes());
242    ///     
243    ///     let html_vec: Vec<u8> = html.into();
244    ///     let result = r##"
245    ///             <!DOCTYPE html><html><body>
246    ///                 <p>Welcome to the site, <b>Alice!</b></p>
247    ///                 <p>You are visitor number: 1234</p>
248    ///             </body></html>"##.as_bytes().to_vec();
249    ///     
250    ///     assert!(result.eq(&html_vec));
251    ///     return Ok(());
252    /// }
253    /// ```
254    pub fn get_placeholders(&self) -> Result<Placeholders, Error> {
255        let re_placeholder = RE_PLACEHOLDER.get_or_init(|| RegexBuilder::new(PLACEHOLDER_REGEX)
256            .dot_matches_new_line(true)
257            .build()
258            .unwrap()
259        );
260
261        let mut placeholders = Vec::new();
262        for capture in  re_placeholder.captures_iter(&self.bytes) {
263            let comment = unsafe{ capture.get(0).unwrap_unchecked() };
264            let placeholder_name = unsafe{ capture.get(1).unwrap_unchecked() };
265            let name = str::from_utf8(placeholder_name.as_bytes())?;
266            placeholders.push(
267                Placeholder {
268                    start: comment.start(),
269                    end: comment.end(),
270                    name: name.to_owned(),
271                }
272            )
273        }
274        
275        return Ok(placeholders.into());
276    }
277
278    /// Replaces the `placeholder_name` placeholder in the calling Html struct
279    /// with `replacement`. Upon completion of this function, the replaced
280    /// Placeholder will be removed from `placeholders`. 
281    pub fn replace_placeholder(&mut self, placeholders: &mut Placeholders, placeholder_name: &str, replacement: &[u8]) {
282        // Placeholders are kept in sorted order so that only what's necessary to update can be updated 
283        if let Some(index) = placeholders.data.iter().position(|p| p.name.eq(placeholder_name)) {
284            let to_be_replaced = placeholders.data.remove(index);
285            let bytes_added: isize = replacement.len() as isize - (to_be_replaced.end - to_be_replaced.start) as isize;
286            for i in index..placeholders.data.len() {
287                let placeholder = placeholders.data.get_mut(i).unwrap();
288                placeholder.start = (placeholder.start as isize + bytes_added) as usize;
289                placeholder.end = (placeholder.end as isize + bytes_added) as usize;
290            }
291            self.bytes.splice(to_be_replaced.start..to_be_replaced.end, replacement.to_vec());
292        }
293    }
294}
295
/// One `#placeholder` comment found in a document: the half-open byte range
/// `start..end` covered by the comment and the name written inside it.
#[derive(Debug, PartialEq)]
pub struct Placeholder {
    start: usize,
    end: usize,
    name: String,
}

impl PartialEq<str> for Placeholder {
    /// A placeholder equals a string when that string is its name.
    fn eq(&self, other: &str) -> bool {
        self.name.as_str() == other
    }
}

/// Collection of the placeholders found in a document.
#[derive(Debug)]
pub struct Placeholders {
    data: Vec<Placeholder>,
}

impl From<Vec<Placeholder>> for Placeholders {
    /// Wraps an already collected list of placeholders.
    fn from(data: Vec<Placeholder>) -> Self {
        Placeholders { data }
    }
}

impl Placeholders {
    /// Reports whether any stored placeholder compares equal to `value`.
    ///
    /// `value` can be anything a [`Placeholder`] can be compared with, for
    /// example a `str` holding a placeholder name.
    pub fn contains<T>(&self, value: &T) -> bool
    where
        T: ?Sized,
        Placeholder: PartialEq<T>,
    {
        self.data.iter().any(|entry| PartialEq::eq(entry, value))
    }
}
329
330
331
332/// Processes all files in `source` and places the results into the dir in
333/// `dest`. `source` can be either a file or a directory, but `dest` must only
334/// be a directory. Processing means that all #include comments in the source
335/// html are replaced with the file specified in the comment. Processing will
336/// not replace #placeholder comments, as these are mean to be replaced
337/// dynamically each time the file is requested. 
338///
339/// # Examples
340///
341/// ```
342/// fn main() {
343///     htmlprep::compile("/var/www/staging", "/var/www/prod", "/");
344///     // All files in staging will be copied to prod
345/// }
346/// ```
347pub fn compile(source: &str, dest: &str, webroot: &str) -> Result<(), Error> 
348{
349    let mut options = CompileOptions::default();
350    options.source = Path::new(source);
351    options.dest = Path::new(dest);
352    options.root = Path::new(webroot);
353    return options.compile();
354}
355
/// Web servers usually change their root dir before serving files. Thus, paths
/// in html files are likely to be based on a different root, however, this
/// library will probably be called by a user who has not changed their root.
/// Thus, this function is necessary to change the root of any absolute paths in
/// html files.
///
/// A path that starts with `/` is resolved against `website_root`; any other
/// path is resolved against `cwd`.
///
/// # Errors
///
/// Returns the `Utf8Error` when `path_in_comment` is not valid UTF-8.
fn make_path_absolute(path_in_comment: &[u8], website_root: &Path, cwd: &Path) -> Result<Box<Path>, core::str::Utf8Error> {
    let path_as_str = str::from_utf8(path_in_comment)?;
    let resolved: PathBuf = match path_as_str.strip_prefix('/') {
        // Strip every leading slash, not just the first one: joining a
        // remainder that is still absolute (as in "//etc/passwd") would
        // replace `website_root` instead of extending it.
        Some(rest) => website_root.join(rest.trim_start_matches('/')),
        None => cwd.join(path_as_str),
    };
    Ok(resolved.into_boxed_path())
}