// pdf_min/html.rs

1use crate::*;
2use std::collections::BTreeMap;
3
4/// Convert source html to PDF using Writer w.
5pub fn html(w: &mut Writer, source: &[u8]) {
6    let mut p = Parser::new(source);
7    p.read_token();
8    html_inner(w, &mut p, b"");
9}
10
/// Kind of the token most recently read by the parser.
#[derive(Debug)]
enum Token {
    /// A run of bytes that is neither a tag nor white space.
    Text,
    /// A start tag such as `<h1 name=x>` or an end tag such as `</h1>`.
    Tag,
    /// Space or newline characters.
    WhiteSpace,
    /// End of input; also the parser's initial state, and the state it
    /// reports when the input ends in the middle of a tag name.
    Eof,
}
18
19struct Parser<'a> {
20    source: &'a [u8],
21    position: usize,
22    token_start: usize,
23    token_end: usize,
24    end_tag: bool,
25    token: Token,
26    attr: BTreeMap<&'a [u8], &'a [u8]>,
27}
28
29impl<'a> Parser<'a> {
30    fn new(source: &'a [u8]) -> Self {
31        Self {
32            source,
33            position: 0,
34            token_start: 0,
35            token_end: 0,
36            end_tag: false,
37            token: Token::Eof,
38            attr: BTreeMap::new(),
39        }
40    }
41
42    fn tvalue(&self) -> &'a [u8] {
43        &self.source[self.token_start..self.token_end]
44    }
45
46    /// Get attribute value,
47    fn avalue(&self, name: &'a [u8]) -> Option<&&'a [u8]> {
48        self.attr.get(name)
49    }
50
51    /// Get integer attribute, return None on not present or error.
52    fn aint(&self, name: &'a [u8]) -> Option<Px> {
53        if let Some(s) = self.avalue(name)
54            && let Ok(x) = tos(s).parse::<Px>()
55        {
56            return Some(x);
57        }
58        None
59    }
60
61    fn next(&mut self) -> u8 {
62        if self.position == self.source.len() {
63            0
64        } else {
65            let c = self.source[self.position];
66            self.position += 1;
67            c
68        }
69    }
70
71    fn next_non_space(&mut self) -> u8 {
72        loop {
73            let c = self.next();
74            if c != b' ' {
75                return c;
76            }
77        }
78    }
79
80    fn read_tag_attributes(&mut self) {
81        // Example: width = 15 alt = "something" src = "something" >
82        loop {
83            let mut c = self.next_non_space();
84            let attr_name_start = self.position - 1;
85            while c != b'=' && c != b' ' && c != b'>' && c != 0 {
86                c = self.next();
87            }
88            if c == b'>' {
89                return;
90            }
91            let attr_name = &self.source[attr_name_start..self.position - 1];
92            if c == b' ' {
93                c = self.next_non_space();
94            }
95            if c != b'=' {
96                return;
97            }
98            c = self.next_non_space();
99            let start = self.position - 1;
100            let attr = if c == b'"' {
101                // Read quoted attribute
102                c = self.next();
103                while c != b'"' && c != 0 {
104                    c = self.next();
105                }
106                if c != b'"' {
107                    return;
108                }
109                &self.source[start + 1..self.position - 1]
110            } else {
111                // Read unquoted attribute
112                while c != b' ' && c != b'>' && c != 0 {
113                    c = self.next();
114                }
115                &self.source[start..self.position - 1]
116            };
117            self.attr.insert(attr_name, attr);
118            if c == b'>' {
119                return;
120            }
121        }
122    }
123
124    fn read_token(&mut self) {
125        let c = self.next();
126        if c == 0 {
127            self.token = Token::Eof;
128        } else if c == b' ' || c == b'\n' {
129            self.token = Token::WhiteSpace;
130            loop {
131                let c = self.next();
132                if c != b' ' || c != b'\n' {
133                    if c != 0 {
134                        self.position -= 1;
135                    }
136                    break;
137                }
138            }
139        } else if c == b'<' {
140            // e.g. <h1 name=x> or </h1>
141            self.token = Token::Tag;
142            self.token_start = self.position;
143            self.end_tag = false;
144            let mut c = self.next();
145            if c == b'/' {
146                self.end_tag = true;
147                self.token_start = self.position;
148                c = self.next();
149            }
150            loop {
151                // To find end of tag name
152                if c == b' ' {
153                    self.token_end = self.position - 1;
154                    self.read_tag_attributes();
155                    break;
156                } else if c == b'>' {
157                    self.token_end = self.position - 1;
158                    break; // No attributes to parse
159                } else if c == 0 {
160                    self.token = Token::Eof; // Error
161                    return;
162                } else {
163                    c = self.next();
164                }
165            }
166        } else {
167            self.token = Token::Text;
168            self.token_start = self.position - 1;
169            let mut c = self.next();
170            loop {
171                if c == b'<' || c == b' ' || c == b'\n' {
172                    self.position -= 1;
173                    self.token_end = self.position;
174                    break;
175                } else if c == 0 {
176                    self.token_end = self.position;
177                    break;
178                }
179                c = self.next();
180            }
181        }
182    }
183}
184
185fn html_inner(w: &mut Writer, p: &mut Parser, endtag: &[u8]) {
186    loop {
187        match p.token {
188            Token::Eof => {
189                return;
190            }
191            Token::WhiteSpace => {
192                w.space();
193                p.read_token();
194            }
195            Token::Text => {
196                let s = tos(p.tvalue());
197                let s = &html_escape::decode_html_entities(s);
198                w.text(s);
199                p.read_token();
200            }
201            Token::Tag => {
202                let tag = p.tvalue();
203                if p.end_tag {
204                    if tag == endtag {
205                        p.read_token();
206                    }
207                    return;
208                } else if tag == b"p" && tag == endtag {
209                    return;
210                }
211                p.read_token();
212                if tag == b"br" || tag == b"br/" {
213                    w.output_line();
214                } else if tag == b"img" {
215                    if let Some(src) = p.avalue(b"src") {
216                        let width = p.aint(b"width");
217                        let height = p.aint(b"height");
218                        w.image(tos(src), width, height);
219                    }
220                } else {
221                    let save_mode = w.mode;
222                    let save_font = w.cur_font;
223                    let save_font_size = w.font_size;
224                    let mut save: Px = 0;
225                    match tag {
226                        b"p" => w.output_line(),
227                        b"h1" => {
228                            w.font_size = 14;
229                            w.output_line();
230                            save = if w.center { 1 } else { 0 };
231                            w.center = true;
232                        }
233                        b"b" => w.cur_font |= 1,
234                        b"i" => w.cur_font |= 2,
235                        b"title" => w.mode = Mode::Title,
236                        b"html" | b"head" => w.mode = Mode::Head,
237                        b"body" => w.mode = Mode::Normal,
238                        b"sup" => {
239                            save = w.sup;
240                            w.set_sup(w.font_size / 2);
241                        }
242                        b"sub" => {
243                            save = w.sup;
244                            w.set_sup(-w.font_size / 2);
245                        }
246                        _ => {}
247                    }
248                    html_inner(w, p, tag);
249                    w.mode = save_mode;
250                    w.font_size = save_font_size;
251                    w.cur_font = save_font;
252                    match tag {
253                        b"sup" | b"sub" => w.set_sup(save),
254                        b"h1" => {
255                            w.output_line();
256                            w.center = save == 1;
257                        }
258                        _ => {}
259                    }
260                }
261            }
262        }
263    }
264}
265
/// View a byte slice as a string slice without copying.
///
/// # Panics
///
/// Panics if `s` is not valid UTF-8.
fn tos(s: &[u8]) -> &str {
    let decoded: Result<&str, std::str::Utf8Error> = std::str::from_utf8(s);
    decoded.unwrap()
}