syzlang_parser/token.rs

//! Parse the Syzkaller description language (syzlang) into a series of tokens for later processing.
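//!
//! An illustrative usage sketch (assuming this module is exposed as
//! `syzlang_parser::token`):
//!
//! ```
//! use syzlang_parser::token::Token;
//!
//! // Tokenize a single syzlang declaration into a flat token stream
//! let tokens = Token::create_from_str("resource fd[int32]: -1").unwrap();
//! assert_eq!(tokens[0], Token::Resource);
//! ```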

use crate::{generror, verify, Error, Result};
use log::{debug, error, trace};
use serde::{Deserialize, Serialize};
use std::path::Path;

/// All the different tokens we divide the language into
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum Token {
	/// Include a C header file
	Include,

	/// Include a directory of C header files
	Incdir,

	/// Resource keyword indicating a resource is being declared
	Resource,

	/// Type keyword, indicating a type is being declared
	Type,

	/// Define statement indicating a C macro expression is coming next
	Define,

	/// Meta keyword; the next entry can be some meta information about the file
	Meta,

	/// Char: <
	CrocOpen,

	/// Char: >
	CrocClose,

	/// Char: (
	ParenOpen,

	/// Char: )
	ParenClose,

	/// Char: {
	BracketOpen,

	/// Char: }
	BracketClose,

	/// Char: [
	SquareOpen,

	/// Char: ]
	SquareClose,

	/// Char: :
	Colon,

	/// Char: ,
	Comma,

	/// End of a line; one is emitted for every parsed line
	Newline,

	/// Char: =
	Equal,

	/// Char: $
	Dollar,

	/// The text has been processed as a comment because it started with '#'
	Comment(String),

	/// Some string enclosed in double quotes or backticks
	String(String),

	/// Some identifier we didn't match to any keyword
	Name(String),

	/// Character enclosed in single quotes
	Char(char),
}

impl Token {
	/// Load all tokens from a file
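	///
	/// A minimal usage sketch; the module path and file name below are
	/// illustrative assumptions rather than paths shipped with this crate:
	///
	/// ```no_run
	/// use std::path::Path;
	/// use syzlang_parser::token::Token;
	///
	/// // Hypothetical syzlang description file
	/// let tokens = Token::from_file(Path::new("descriptions/sys.txt")).unwrap();
	/// assert!(!tokens.is_empty());
	/// ```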
	pub fn from_file(s: &Path) -> Result<Vec<Token>> {
		debug!("loading file {s:?}");
		let data = std::fs::read(s)?;
		let data = std::str::from_utf8(&data)?;
		Self::create_from_str(data)
	}
	/// Load all files matching a glob pattern
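	///
	/// A minimal usage sketch; the directory and pattern are illustrative
	/// assumptions (syzkaller descriptions are conventionally `.txt` files):
	///
	/// ```no_run
	/// use std::path::Path;
	/// use syzlang_parser::token::Token;
	///
	/// // Tokenize every matching file below the given directory
	/// let _all = Token::all_globs(Path::new("sys/linux"), "*.txt").unwrap();
	/// ```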
	pub fn all_globs(dir: &Path, pattern: &str) -> Result<Vec<Vec<Token>>> {
		debug!("loading globs {pattern} @ {dir:?}");
		if let Some(q) = dir.as_os_str().to_str() {
			let mut q = q.to_string();
			q.push('/');
			q.push_str(pattern);

			let mut ret = Vec::new();
			for file in glob::glob(&q).unwrap() {
				let file = file.unwrap();
				debug!("file {file:?}");
				let n = Self::from_file(file.as_path())?;
				ret.push(n);
			}
			Ok(ret)
		} else {
			Err(Error::Error(format!(
				"unable to parse dir to string {dir:?}"
			)))
		}
	}

	/// Load all tokens from a string
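	///
	/// An illustrative example (module path is an assumption); note that a
	/// quoted string may span whitespace:
	///
	/// ```
	/// use syzlang_parser::token::Token;
	///
	/// let tokens = Token::create_from_str(r#"abcd = "Hello World!""#).unwrap();
	/// assert_eq!(
	/// 	tokens,
	/// 	vec![
	/// 		Token::Name(String::from("abcd")),
	/// 		Token::Equal,
	/// 		Token::String(String::from("Hello World!")),
	/// 	]
	/// );
	/// ```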
	pub fn create_from_str(data: &str) -> Result<Vec<Token>> {
		trace!("parsing '{data}'");
		let mut ret = Vec::new();
		let mut curr = String::default();
		let mut quote = None;

		// let parts: Vec<_> = data.split('\n').collect();
		for (i, line) in data.split('\n').enumerate() {
			trace!("line[{i}]: {line}");

			// TODO: Doesn't preserve number of spaces or type of whitespace
			let line = line.trim();
			if quote.is_none() && line.is_empty() {
				ret.push(Token::Newline);
				continue;
			}

			if line.starts_with('#') {
				let ins = Token::Comment(line.to_string());
				ret.push(ins);
				ret.push(Token::Newline);
				continue;
			}
			for item in line.split([' ', '\t']) {
				trace!("item = '{item}'");
				if let Some(q) = &quote {
					curr.push(' ');
					curr.push_str(item);
					if Self::quote_enclosed(&curr, *q) || *q == '\'' {
						Self::parse_loop(curr, &mut ret)?;
						// curr.clear();
						curr = String::default();
						quote = None;
						continue;
					}
				} else if !Self::quote_enclosed(item, '"') {
					quote = Some('"');
					curr.push_str(item);
				} else if !Self::quote_enclosed(item, '\'') {
					quote = Some('\'');
					curr.push_str(item);
				} else if !Self::quote_enclosed(item, '`') {
					quote = Some('`');
					curr.push_str(item);
				} else {
					Self::parse_loop(item, &mut ret)?;
				}
			}

			ret.push(Token::Newline);
		}
		// We always add an extra newline at the end
		ret.pop();

		if !curr.is_empty() {
			return Err(Self::error(format!(
				"remaining data from unenclosed quote '{curr}'"
			)));
		}
		let ret = Self::post_proc(ret);
		Ok(ret)
	}

	/// Get token as string, but only if it's a valid name identifier
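	///
	/// An illustrative example (module path is an assumption); non-`Name`
	/// tokens produce an error instead:
	///
	/// ```
	/// use syzlang_parser::token::Token;
	///
	/// let t = Token::Name(String::from("fd"));
	/// assert_eq!(t.to_name().unwrap(), "fd");
	/// ```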
	pub fn to_name(&self) -> Result<&String> {
		debug!("calling to_name {self:?}");
		match self {
			Token::Name(n) => Ok(n),
			_ => generror!(format!("cannot parse {self:?} as string")),
		}
	}
	/// Log a tokenizer error and wrap it in [`Error::Tokenize`]
	fn error<S: Into<String>>(err: S) -> Error {
		let err: String = err.into();
		error!("tokenize error {err}");
		Error::Tokenize(err)
	}
	/// Check whether a character is allowed in a [`Token::Name`] identifier
	fn valid_name_char(c: char) -> bool {
		c.is_ascii_lowercase()
			|| c.is_ascii_uppercase()
			|| c.is_ascii_digit()
			|| c == '_'
			|| c == '/'
			|| c == '.'
			|| c == '?'
			|| c == '-'
			|| c == '\''
	}
	/// Post-process the token stream, rewriting `type` and `meta` keyword
	/// tokens back into [`Token::Name`] when context shows they are used as
	/// identifiers rather than keywords
	fn post_proc(mut tokens: Vec<Token>) -> Vec<Token> {
		let mut ret = Vec::with_capacity(tokens.len());
		let mut paren = 0;
		let mut bracket = 0;
		let mut square = 0;
		while !tokens.is_empty() {
			let r = tokens.remove(0);
			match &r {
				Token::ParenOpen => paren += 1,
				Token::ParenClose => paren -= 1,
				Token::BracketOpen => bracket += 1,
				Token::BracketClose => bracket -= 1,
				Token::SquareOpen => square += 1,
				Token::SquareClose => square -= 1,
				Token::Type => {
					if paren > 0 || bracket > 0 || square > 0 {
						// We should never see a type specifier inside a
						// function, struct or union, so if we do, we assume
						// it's an argument name
						ret.push(Token::Name(String::from("type")));
						continue;
					}
				}
				Token::Meta => {
					if let Some(x) = tokens.first() {
						if let Token::Name(n) = x {
							if n != "noextract" && n != "arches" {
								ret.push(Token::Name(String::from("meta")));
								continue;
							}
							// Was a correct meta token, we push it below
						}
					} else {
						ret.push(Token::Name(String::from("meta")));
						continue;
					}
				}
				_ => {}
			}
			ret.push(r);
		}
		ret
	}
	/// Parse a single token from the front of `s`, returning the token and
	/// any remaining text that still needs to be parsed
	fn parse(s: String) -> Result<(Self, Option<String>)> {
		trace!("parse {s}");
		verify!(!s.is_empty(), UnexpectedToken);
		let mut ss = s.chars();
		let f = ss.next().unwrap();
		let rem: String = ss.collect();
		trace!("checking char {f:?}");
		trace!("rem {rem:?}");
		let n = match f {
			'(' => (Token::ParenOpen, Some(rem)),
			')' => (Token::ParenClose, Some(rem)),
			'[' => (Token::SquareOpen, Some(rem)),
			']' => (Token::SquareClose, Some(rem)),
			'{' => (Token::BracketOpen, Some(rem)),
			'}' => (Token::BracketClose, Some(rem)),
			':' => (Token::Colon, Some(rem)),
			'<' => (Token::CrocOpen, Some(rem)),
			'>' => (Token::CrocClose, Some(rem)),
			',' => (Token::Comma, Some(rem)),
			'=' => (Token::Equal, Some(rem)),
			'$' => (Token::Dollar, Some(rem)),
			'\'' => {
				let val = rem.chars().next();
				let nq = rem.chars().nth(1);
				if nq == Some('\'') {
					(Token::Char(val.unwrap()), Some(rem[2..].to_string()))
				} else {
					(Token::String(String::from("'")), Some(rem))
				}
			}
			'"' | '`' => {
				if let Some(idx) = rem.find(f) {
					let str = rem[..idx].to_string();
					let rem = rem[idx + 1..].to_string();
					(Token::String(str), Some(rem))
				} else {
					return Err(Self::error(format!(
						"Unable to find enclosing quote in {rem}"
					)));
				}
			}
			'\n' => (Token::Newline, Some(rem)),
			_ => {
				// rem.insert(0, f);
				let empty = None;
				match s.as_str() {
					"include" => (Token::Include, empty),
					"incdir" => (Token::Incdir, empty),
					"resource" => (Token::Resource, empty),
					"type" => (Token::Type, empty),
					"define" => (Token::Define, empty),
					"meta" => (Token::Meta, empty),
					_ => {
						// Not a keyword; consume valid identifier characters
						// into a name and keep the rest for the next pass
						let mut start = String::from("");
						start.push(f);

						let mut prem = String::from("");
						let mut ss = rem.chars();

						while let Some(c) = ss.next() {
							if Self::valid_name_char(c) {
								start.push(c)
							} else {
								prem.push(c);
								let ins: String = ss.collect();
								prem.push_str(&ins);
								break;
							}
						}
						trace!("start {start} | prem: '{prem}'");
						let ins = Token::Name(start);
						(ins, Some(prem))
					}
				}
			}
		};
		Ok(n)
	}
	/// Check whether all occurrences of `quote` in `s` are balanced, i.e. the
	/// string contains an even number of them
	fn quote_enclosed(s: &str, quote: char) -> bool {
		let count = s.chars().filter(|c| *c == quote).count();
		count % 2 == 0
	}
	/// Repeatedly parse tokens from `item` until no text remains
	fn parse_loop<S: Into<String>>(item: S, tokens: &mut Vec<Token>) -> Result<()> {
		let mut item: String = item.into();
		while !item.is_empty() {
			let (ins, rem) = Token::parse(item)?;
			tokens.push(ins);
			if let Some(n) = rem {
				item = n;
			} else {
				break;
			}
		}
		Ok(())
	}
}

#[cfg(test)]
mod test {
	use super::*;

	#[test]
	fn tokens0() {
		let s = r#"resource fd[int32]: -1"#;
		let t = Token::create_from_str(s).unwrap();
		assert_eq!(
			t,
			vec![
				Token::Resource,
				Token::Name(String::from("fd")),
				Token::SquareOpen,
				Token::Name(String::from("int32")),
				Token::SquareClose,
				Token::Colon,
				Token::Name(String::from("-1")),
			]
		);
	}

	#[test]
	fn tokens1() {
		let s = r#"abcd = "hello", `world`, "!", "Hello World!", `acdb efgh`"#;
		let t = Token::create_from_str(s).unwrap();
		assert_eq!(
			t,
			vec![
				Token::Name(String::from("abcd")),
				Token::Equal,
				Token::String(String::from("hello")),
				Token::Comma,
				Token::String(String::from("world")),
				Token::Comma,
				Token::String(String::from("!")),
				Token::Comma,
				Token::String(String::from("Hello World!")),
				Token::Comma,
				Token::String(String::from("acdb efgh"))
			]
		);
	}

	#[test]
	fn tokens2() {
		// Check that extra newlines are preserved
		let s = r#"
# Some comment

func$abcd(type int32, meta int64) fd

"#;
		let t = Token::create_from_str(s).unwrap();
		assert_eq!(
			t,
			vec![
				Token::Newline,
				Token::Comment(String::from("# Some comment")),
				Token::Newline,
				Token::Newline,
				Token::Name(String::from("func")),
				Token::Dollar,
				Token::Name(String::from("abcd")),
				Token::ParenOpen,
				Token::Name(String::from("type")),
				Token::Name(String::from("int32")),
				Token::Comma,
				Token::Name(String::from("meta")),
				Token::Name(String::from("int64")),
				Token::ParenClose,
				Token::Name(String::from("fd")),
				Token::Newline,
				Token::Newline
			]
		);
	}
	#[test]
	fn tokens3() {
		let s = r#"const[' ', int8]"#;
		let t = Token::create_from_str(s).unwrap();
		assert_eq!(
			t,
			vec![
				Token::Name(String::from("const")),
				Token::SquareOpen,
				Token::Char(' '),
				Token::Comma,
				Token::Name(String::from("int8")),
				Token::SquareClose
			]
		);
	}

	#[test]
	fn bad_tokens0() {
		let s = r#"value = "asd", "qwert"#;
		let t = Token::create_from_str(s);
		assert!(t.is_err());
	}
}