asm_lsp/
parser.rs

1use std::{
2    collections::HashMap,
3    env::args,
4    fs,
5    io::Write as _,
6    iter::Peekable,
7    path::PathBuf,
8    str::{FromStr, Lines},
9};
10
11use anyhow::{Result, anyhow};
12use htmlentity::entity::ICodedDataTrait;
13use quick_xml::Reader;
14use quick_xml::escape::unescape;
15use quick_xml::events::attributes::Attribute;
16use quick_xml::events::{BytesStart, Event};
17use quick_xml::name::QName;
18use regex::Regex;
19use reqwest;
20use serde::Deserialize;
21use url_escape::encode_www_form_urlencoded;
22
23use crate::{
24    AvrStatusRegister, AvrTiming, InstructionAlias,
25    types::{
26        Arch, Assembler, Directive, ISA, Instruction, InstructionForm, MMXMode, NameToDirectiveMap,
27        NameToInstructionMap, NameToRegisterMap, Operand, OperandType, Register, RegisterBitInfo,
28        RegisterType, RegisterWidth, XMMMode, Z80Timing, Z80TimingInfo,
29    },
30    ustr,
31};
32
33/// Parse all of the register information witin the documentation file
34///
35/// Current function assumes that the RST file is already read and that it's been given a reference
36/// to its contents (`&str`).
37///
38/// # Errors
39///
40/// This function will not error, it maintains a `Result` return type for compatibility
41/// with a macro in the server's test code
42///
43/// # Panics
44///
45/// This function is highly specialized to parse a specific file and will panic
46/// for most mal-formed/unexpected inputs
47pub fn populate_riscv_registers(rst_contents: &str) -> Result<Vec<Register>> {
48    enum ParseState {
49        FileStart,
50        SectionStart,
51        TableStart,
52        TableSeparator,
53        TableEntry,
54        TableEnd,
55        FileEnd,
56    }
57    let mut parse_state = ParseState::FileStart;
58    let mut registers = Vec::new();
59    let mut curr_reg_type: Option<RegisterType> = None;
60    let mut lines = rst_contents.lines().peekable();
61
62    loop {
63        match parse_state {
64            ParseState::FileStart => {
65                let file_header = lines.next().unwrap();
66                assert!(file_header.eq("Register Definitions"));
67                let separator = lines.next().unwrap();
68                assert!(separator.starts_with('='));
69                consume_empty_lines(&mut lines);
70                parse_state = ParseState::SectionStart;
71            }
72            ParseState::SectionStart => {
73                let section_header = lines.next().unwrap();
74                if section_header.contains("Integer") {
75                    curr_reg_type = Some(RegisterType::GeneralPurpose);
76                } else if section_header.contains("Floating Point") {
77                    curr_reg_type = Some(RegisterType::FloatingPoint);
78                } else {
79                    panic!("Unexpected section header: {section_header}");
80                }
81                let separator = lines.next().unwrap();
82                assert!(separator.starts_with('-'));
83                consume_empty_lines(&mut lines);
84                parse_state = ParseState::TableStart;
85            }
86            ParseState::TableStart => {
87                let top = lines.next().unwrap();
88                assert!(top.starts_with('+'));
89                let column_headers = lines.next().unwrap();
90                assert!(
91                    column_headers
92                        .eq("|Register | ABI Name | Description                       | Saver  |")
93                );
94                parse_state = ParseState::TableSeparator;
95            }
96            ParseState::TableSeparator => {
97                let separator = lines.next().unwrap();
98                assert!(separator.starts_with('+'));
99                match lines.peek() {
100                    Some(next) => {
101                        if next.is_empty() {
102                            parse_state = ParseState::TableEnd;
103                        } else {
104                            parse_state = ParseState::TableEntry;
105                        }
106                    }
107                    None => parse_state = ParseState::TableEnd,
108                }
109            }
110            ParseState::TableEntry => {
111                let entries: Vec<&str> = lines
112                    .next()
113                    .unwrap()
114                    .trim_start_matches('|')
115                    .trim_end_matches('|')
116                    .split('|')
117                    .collect();
118                assert!(entries.len() == 4);
119                let saved_info = if entries[3].trim_ascii().is_empty() {
120                    String::new()
121                } else {
122                    format!("\n{} saved", entries[3].trim_ascii())
123                };
124                let description = format!("{}{}", entries[2].trim_ascii(), saved_info);
125                let reg_name = entries[0].trim_ascii().to_lowercase();
126                let curr_register = Register {
127                    name: reg_name,
128                    description: Some(description),
129                    reg_type: curr_reg_type,
130                    arch: Arch::RISCV,
131                    ..Default::default()
132                };
133                registers.push(curr_register);
134                parse_state = ParseState::TableSeparator;
135            }
136            ParseState::TableEnd => {
137                consume_empty_lines(&mut lines);
138                if lines.peek().is_some() {
139                    parse_state = ParseState::SectionStart;
140                } else {
141                    parse_state = ParseState::FileEnd;
142                }
143            }
144            ParseState::FileEnd => break,
145        }
146    }
147
148    Ok(registers)
149}
150
151/// Parse all of the RISCV instruction rst files inside of `docs_dir`
152/// Each file is expected to correspond to part of an `Instruction` object
153///
154/// Current function assumes that the RST file is already read and that it's been given a reference
155/// to its contents (`&str`).
156///
157/// # Errors
158///
159/// This function will return `Err` if an rst file within `docs_path` cannot be parsed,
160/// or if `docs_path` cannot be read
161///
162/// # Panics
163///
164/// Will panic the parser fails to extract an instruction name from a given file
165pub fn populate_riscv_instructions(docs_path: &PathBuf) -> Result<Vec<Instruction>> {
166    let mut instructions_map = HashMap::<String, Instruction>::new();
167
168    // ensure we iterate through all files in a deterministic order
169    let mut entries: Vec<PathBuf> = std::fs::read_dir(docs_path)?
170        .map(|res| res.map(|e| e.path()))
171        .collect::<Result<Vec<_>, std::io::Error>>()?;
172    entries.sort();
173
174    // parse all instruction docs
175    for path in entries {
176        if let Ok(docs) = std::fs::read_to_string(&path) {
177            for instr in parse_riscv_instructions(&docs) {
178                let instr_name = instr.name.to_ascii_lowercase();
179                assert!(!instructions_map.contains_key(&instr_name));
180                instructions_map.insert(instr_name, instr);
181            }
182        }
183    }
184
185    Ok(instructions_map.into_values().collect())
186}
187
188/// Parse an rst file containing the documentation for several RISCV instructions
189///
190/// # Errors
191///
192/// This function is highly specialized to parse a handful of files and will panic or return
193/// `Err` for most mal-formed inputs
194///
195/// # Panics
196///
197/// This function is highly specialized to parse a handful of files and will panic or return
198/// `Err` for most mal-formed/unexpected inputs
199fn parse_riscv_instructions(rst_contents: &str) -> Vec<Instruction> {
200    // We could pull in an actual rst parser to do this, but the files' contents
201    // are straightforward/structured enough that this should be fairly trivial
202    enum ParseState {
203        FileStart,
204        InstructionStart,
205        InstructionTableInfo,
206        InstructionFormat,
207        InstructionDescription,
208        InstructionImplementation,
209        InstructionExpansion,
210        FileEnd,
211    }
212    let mut parse_state = ParseState::FileStart;
213    let mut instructions = Vec::new();
214    let mut curr_instruction = Instruction {
215        arch: Arch::RISCV,
216        ..Default::default()
217    };
218    let mut lines = rst_contents.lines().peekable();
219
220    loop {
221        match parse_state {
222            ParseState::FileStart => {
223                let _header = lines.next().unwrap();
224                let separator = lines.next().unwrap();
225                assert!(separator.trim_ascii().starts_with('='));
226                consume_empty_lines(&mut lines);
227                parse_state = ParseState::InstructionStart;
228            }
229            ParseState::InstructionStart => {
230                curr_instruction.name = lines.next().unwrap().trim_ascii().to_ascii_lowercase();
231                let separator = lines.next().unwrap();
232                // e.g. ----------
233                assert!(separator.trim_ascii().starts_with('-'));
234                consume_empty_lines(&mut lines);
235
236                // some forms have an explanation for the mnemonic before the table section
237                if !lines.peek().unwrap().starts_with("..") {
238                    curr_instruction.summary =
239                        format!("{}\n\n", lines.next().unwrap().trim_ascii());
240                    consume_empty_lines(&mut lines);
241                }
242                parse_state = ParseState::InstructionTableInfo;
243            }
244            ParseState::InstructionTableInfo => {
245                // e.g. .. tabularcolumns:: |c|c|c|c|c|c|c|c|
246                let table_info_1 = lines.next().unwrap();
247                assert!(table_info_1.trim_ascii().starts_with(".."));
248                // e.g. .. table::
249                let table_info_2 = lines.next().unwrap();
250                assert!(table_info_2.trim_ascii().starts_with(".."));
251
252                consume_empty_lines(&mut lines);
253
254                /* e.g.
255                  +-----+--+--+-----+-----+-----+-----+-----+---+
256                  |31-27|26|25|24-20|19-15|14-12|11-7 |6-2  |1-0|
257                  +-----+--+--+-----+-----+-----+-----+-----+---+
258                  |11100|aq|rl|rs2  |rs1  |011  |rd   |01011|11 |
259                  +-----+--+--+-----+-----+-----+-----+-----+---+
260                */
261                let top = lines.next().unwrap();
262                assert!(top.trim_ascii().starts_with('+'));
263                let first_row = lines.next().unwrap();
264                assert!(first_row.trim_ascii().starts_with('|'));
265                let middle = lines.next().unwrap();
266                assert!(middle.trim_ascii().starts_with('+'));
267                let second_row = lines.next().unwrap();
268                assert!(second_row.trim_ascii().starts_with('|'));
269                let bottom = lines.next().unwrap();
270                assert!(bottom.trim_ascii().starts_with('+'));
271                consume_empty_lines(&mut lines);
272                parse_state = ParseState::InstructionFormat;
273            }
274            ParseState::InstructionFormat => {
275                let header = lines.next().unwrap();
276                assert!(header.eq(":Format:"));
277                curr_instruction.asm_templates.push(
278                    lines
279                        .next()
280                        .unwrap()
281                        .trim_ascii()
282                        .trim_start_matches('|')
283                        .trim_ascii()
284                        .to_string(),
285                );
286                consume_empty_lines(&mut lines);
287                parse_state = ParseState::InstructionDescription;
288            }
289            ParseState::InstructionDescription => {
290                let header = lines.next().unwrap();
291                assert!(header.eq(":Description:"));
292                while let Some(next) = lines.peek() {
293                    if next.contains('|') {
294                        curr_instruction.summary += lines
295                            .next()
296                            .unwrap()
297                            .trim_ascii()
298                            .trim_start_matches('|')
299                            .trim_ascii();
300                    } else {
301                        break;
302                    }
303                }
304                consume_empty_lines(&mut lines);
305                parse_state = ParseState::InstructionImplementation;
306            }
307            ParseState::InstructionImplementation => {
308                let header = lines.next().unwrap();
309                assert!(header.eq(":Implementation:"));
310                let _impl_body = lines.next(); // e.g. x[rd] = AMO64(M[x[rs1]] MAXU x[rs2])
311                consume_empty_lines(&mut lines);
312                parse_state = ParseState::InstructionExpansion;
313            }
314            // NOTE: This field isn't present in most files
315            ParseState::InstructionExpansion => {
316                match lines.peek() {
317                    Some(&":Expansion:") => {
318                        let header = lines.next().unwrap();
319                        assert!(header.eq(":Expansion:"));
320                        let _exp_body = lines.next(); // e.g. lw rd\',offset[6:2](rs1\')
321                        consume_empty_lines(&mut lines);
322                        if lines.peek().is_some() {
323                            parse_state = ParseState::InstructionStart;
324                        } else {
325                            parse_state = ParseState::FileEnd;
326                        }
327                    }
328                    Some(other) => {
329                        if other.eq(&".. [classify table]") {
330                            consume_classify_table(&mut lines);
331                        }
332                        if lines.peek().is_some() {
333                            parse_state = ParseState::InstructionStart;
334                        } else {
335                            parse_state = ParseState::FileEnd;
336                        }
337                    }
338                    None => parse_state = ParseState::FileEnd,
339                }
340
341                instructions.push(curr_instruction);
342                curr_instruction = Instruction {
343                    arch: Arch::RISCV,
344                    ..Default::default()
345                };
346            }
347            ParseState::FileEnd => break,
348        }
349    }
350
351    instructions
352}
353
/// Advance `line_iter` past any leading empty lines
///
/// Stops in front of the first line that is not empty, or at the end of the input. A line
/// holding only whitespace is not considered empty.
fn consume_empty_lines(line_iter: &mut Peekable<Lines>) {
    // `next_if` only advances the iterator while the predicate holds
    while line_iter.next_if(|line| line.is_empty()).is_some() {}
}
363
/// Skip over a `.. [classify table]` block
///
/// The block must start with the lines `.. [classify table]`, `.. table::`, a line that
/// trims to `Classify Table:` and an empty line. The table rows that follow are consumed up
/// to, but not including, the next empty line (or up to the end of the input).
///
/// # Panics
///
/// Panics if any of the four header lines is missing or does not match
fn consume_classify_table(line_iter: &mut Peekable<Lines>) {
    assert_eq!(line_iter.next().unwrap(), ".. [classify table]");
    assert_eq!(line_iter.next().unwrap(), ".. table::");
    assert_eq!(line_iter.next().unwrap().trim_ascii(), "Classify Table:");
    assert!(line_iter.next().unwrap().is_empty());

    // Drop the table rows; the empty line that terminates the table is left in place
    while line_iter.next_if(|row| !row.is_empty()).is_some() {}
}
380
381/// Parse all of the ARM instruction xml files inside of `docs_dir`
382/// Each file is expected to correspond to part of an `Instruction` object
383///
384/// Current function assumes that the XML file is already read and that it's been given a reference
385/// to its contents (`&str`).
386///
387/// # Errors
388///
389/// This function will return `Err` if an xml file within `docs_path` cannot be parsed,
390/// or if `docs_path` cannot be read
391///
392/// # Panics
393///
394/// Will panic the parser fails to extract an instruction name from a given file
395pub fn populate_arm_instructions(docs_path: &PathBuf) -> Result<Vec<Instruction>> {
396    let mut instructions_map = HashMap::<String, Instruction>::new();
397    let mut alias_map = HashMap::<String, Vec<InstructionAlias>>::new();
398
399    // ensure we iterate through all files in a deterministic order
400    let mut entries: Vec<PathBuf> = std::fs::read_dir(docs_path)?
401        .map(|res| res.map(|e| e.path()))
402        .collect::<Result<Vec<_>, std::io::Error>>()?;
403    entries.sort();
404
405    // parse all instruction and instruction alias docs
406    for path in entries {
407        if path.extension().unwrap_or_default() != "xml"
408            || path.file_stem().unwrap_or_default() == "notice"
409            || path.file_stem().unwrap_or_default() == "constraint_text_mappings"
410            || path.file_stem().unwrap_or_default() == "shared_pseudocode"
411        {
412            continue;
413        }
414        if let Ok(docs) = std::fs::read_to_string(&path) {
415            if let Some((alias, aliased_instr)) = parse_arm_alias(&docs)? {
416                assert!(!aliased_instr.is_empty());
417                let aliases = alias_map.entry(aliased_instr).or_default();
418                aliases.push(alias);
419            } else if let Some(mut instr) = parse_arm_instruction(&docs) {
420                assert!(!instr.name.is_empty());
421                if let Some(entry) = instructions_map.get_mut(&instr.name) {
422                    entry.aliases.append(&mut instr.aliases);
423                    entry.asm_templates.append(&mut instr.asm_templates);
424                    if entry.summary.is_empty() {
425                        entry.summary = instr.summary;
426                    }
427                } else {
428                    instructions_map.insert(instr.name.clone(), instr);
429                }
430            }
431        } else {
432            println!(
433                "WARNING: Skipping entry, could not read file {}",
434                path.display()
435            );
436        }
437    }
438
439    // add aliases to their corresponding instruction, creating them as necessary
440    for (instr_name, aliases) in &mut alias_map {
441        if let Some(entry) = instructions_map.get_mut(instr_name) {
442            entry.aliases.append(aliases);
443        } else {
444            instructions_map.insert(
445                instr_name.to_owned(),
446                Instruction {
447                    name: instr_name.to_owned(),
448                    // TODO:currently changing into either doesn't change
449                    // anything as both source form the 64bit info which should
450                    // change when arm32 info is added
451                    arch: Arch::ARM64,
452                    aliases: aliases.to_owned(),
453                    ..Default::default()
454                },
455            );
456        }
457    }
458
459    Ok(instructions_map.into_values().collect())
460}
461
462/// Parse an xml file containing the documentation for a single ARM instruction
463/// Treats the contents as an instruction alias, and returns `None` if it is not
464///
465/// # Errors
466///
467/// This function is highly specialized to parse a handful of files and will panic or return
468/// `Err` for most mal-formed inputs
469///
470/// # Panics
471///
472/// This function is highly specialized to parse a handful of files and will panic or return
473/// `Err` for most mal-formed/unexpected inputs
474fn parse_arm_alias(xml_contents: &str) -> Result<Option<(InstructionAlias, String)>> {
475    // iterate through the XML
476    let mut reader = Reader::from_str(xml_contents);
477    let mut aliased_instr: Option<String> = None;
478    let mut alias = InstructionAlias::default();
479    let mut curr_template: Option<String> = None;
480    let mut in_desc = false;
481    let mut in_para = false;
482    let mut in_template = false;
483
484    loop {
485        match reader.read_event() {
486            Ok(Event::Start(ref e)) => match e.name() {
487                QName(b"instructionsection") => {
488                    for attr in e.attributes() {
489                        let Attribute { key, value } = attr.unwrap();
490                        if b"title" == key.into_inner() {
491                            alias.title = ustr::get_str(&value).to_string();
492                        }
493                    }
494                }
495                QName(b"desc") => in_desc = true,
496                QName(b"para") => in_para = true,
497                QName(b"asmtemplate") => in_template = true,
498                QName(b"alphaindex" | b"encodingindex") => return Ok(None),
499                _ => {}
500            },
501            Ok(Event::Text(ref txt)) => {
502                if in_template {
503                    let cleaned = txt.unescape().unwrap();
504                    if let Some(existing) = curr_template {
505                        curr_template = Some(format!("{existing}{cleaned}"));
506                    } else {
507                        let mut new_template = cleaned.into_owned().trim_ascii().to_owned();
508                        new_template.push(' ');
509                        curr_template = Some(new_template);
510                    }
511                } else if in_desc && in_para && alias.summary.is_empty() {
512                    ustr::get_str(txt).clone_into(&mut alias.summary);
513                }
514            }
515            Ok(Event::Empty(ref e)) => {
516                if QName(b"docvar") == e.name() {
517                    let mut alias_next = false;
518                    for attr in e.attributes() {
519                        let Attribute { key, value } = attr.unwrap();
520                        // TODO: we can get the correct alias from the id of an alias mnemonic
521                        // else the actual alias is the last docvar in the docvars tag
522                        if alias_next && b"value" == key.into_inner() {
523                            aliased_instr = Some(ustr::get_str(&value).to_ascii_lowercase());
524                            break;
525                        }
526                        if b"key" == key.into_inner()
527                            && b"alias_mnemonic" == ustr::get_str(&value).as_bytes()
528                        {
529                            alias_next = true;
530                        }
531                    }
532                }
533            }
534            // end event
535            Ok(Event::End(ref e)) => match e.name() {
536                QName(b"instructionsection") => break,
537                QName(b"asmtemplate") => {
538                    if let Some(template) = curr_template.take() {
539                        alias.asm_templates.push(template);
540                    }
541                    in_template = false;
542                }
543                QName(b"docvars") => {
544                    if aliased_instr.is_none() {
545                        return Ok(None);
546                    }
547                }
548                _ => {}
549            },
550            _ => {}
551        }
552    }
553
554    aliased_instr.map_or_else(|| Ok(None), |aliased_name| Ok(Some((alias, aliased_name))))
555}
556
557/// Parse an xml file containing the documentation for a single ARM instruction
558///
559/// # Errors
560///
561/// This function is highly specialized to parse a handful of files and will panic or return
562/// `Err` for most mal-formed inputs
563///
564/// # Panics
565///
566/// This function is highly specialized to parse a handful of files and will panic or return
567/// `Err` for most mal-formed/unexpected inputs
568fn parse_arm_instruction(xml_contents: &str) -> Option<Instruction> {
569    // iterate through the XML
570    let mut reader = Reader::from_str(xml_contents);
571
572    // ref to the instruction that's currently under construction
573    let mut instruction = Instruction {
574        // TODO: switch for archs
575        arch: Arch::ARM64,
576        ..Default::default()
577    };
578    let mut curr_template: Option<String> = None;
579    let mut in_desc = false;
580    let mut in_para = false;
581    let mut in_template = false;
582
583    loop {
584        match reader.read_event() {
585            Ok(Event::Start(ref e)) => match e.name() {
586                QName(b"desc") => in_desc = true,
587                QName(b"para") => in_para = true,
588                QName(b"asmtemplate") => in_template = true,
589                QName(b"alphaindex" | b"encodingindex") => return None,
590                _ => {}
591            },
592            Ok(Event::Empty(ref e)) => {
593                // e.g. <docvar key="mnemonic" value="ABS"/>
594                if QName(b"docvar") == e.name() {
595                    // There are multiple entries like this in each opcode file, but
596                    // *all* of them are the same within each file, so it doesn't matter which
597                    // one we use
598                    if instruction.name.is_empty() {
599                        let mut mnemonic_next = false;
600                        for attr in e.attributes() {
601                            let Attribute { key: _, value } = attr.unwrap();
602                            if b"mnemonic" == ustr::get_str(&value).as_bytes() {
603                                mnemonic_next = true;
604                            } else if mnemonic_next {
605                                instruction.name = ustr::get_str(&value).to_ascii_lowercase();
606                                break;
607                            }
608                        }
609                    }
610                }
611            }
612            Ok(Event::Text(ref txt)) => {
613                if in_template {
614                    let cleaned = txt.unescape().unwrap();
615                    if let Some(existing) = curr_template {
616                        curr_template = Some(format!("{existing}{cleaned}"));
617                    } else {
618                        let mut new_template = cleaned.into_owned().trim_ascii().to_owned();
619                        new_template.push(' ');
620                        curr_template = Some(new_template);
621                    }
622                } else if in_desc && in_para && instruction.summary.is_empty() {
623                    ustr::get_str(txt).clone_into(&mut instruction.summary);
624                }
625            }
626            // end event
627            Ok(Event::End(ref e)) => {
628                match e.name() {
629                    QName(b"instructionsection") => break,
630                    QName(b"encoding") => {
631                        if let Some(template) = curr_template.take() {
632                            instruction.asm_templates.push(template);
633                        }
634                    }
635                    QName(b"desc") => in_desc = false,
636                    QName(b"para") => in_para = false,
637                    QName(b"asmtemplate") => in_template = false,
638                    _ => {} // unknown event
639                }
640            }
641            Ok(Event::Eof) => break,
642            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
643            _ => {} // rest of events that we don't consider
644        }
645    }
646
647    Some(instruction)
648}
649
650/// Parse all of the MARS mips pseudo-ops from the `mips.txt` file
651///
652/// # Errors
653///
654/// This function will return `Err` if `contents` cannot be parsed
655///
656/// # Panics
657///
658/// This function is highly specialized to parse a single file and will panic if the file is not
659/// in the expected format or if it contains unexpected content
660///
661/// <https://github.com/dpetersanderson/MARS/blob/main/PseudoOps.txt>
662pub fn populate_mars_pseudo_instructions(contents: &str) -> Result<Vec<Instruction>> {
663    let mut prev_instr: Option<&mut Instruction> = None;
664    let mut instructions = Vec::new();
665
666    for line in contents
667        .lines()
668        .filter(|l| !l.is_empty() && !l.trim_start().starts_with('#'))
669        .map(str::trim)
670    {
671        let name = line.split_once(' ').unwrap().0;
672        let (_, description) = line.split_once('#').unwrap();
673        let template = line.replace('\t', " ");
674        match prev_instr {
675            Some(ref mut prev) if prev.name == name => {
676                prev.asm_templates.push(template);
677                continue;
678            }
679            _ => {}
680        }
681
682        let mut summary = description.trim().replace('\t', " ");
683        if let Some(colon_idx) = summary.find(':') {
684            // Only keep common description between psuedo op definitions
685            summary = summary[..colon_idx].trim().to_string();
686        }
687
688        instructions.push(Instruction {
689            name: name.to_string(),
690            summary: format!("{summary}\n\nPseudo-op provided by the MARS assembler",),
691            asm_templates: vec![template],
692            arch: Arch::Mips,
693            forms: Vec::new(),
694            aliases: Vec::new(),
695            url: None,
696        });
697
698        prev_instr = instructions.last_mut();
699    }
700
701    Ok(instructions)
702}
703
704/// Parse all of the mips instruction in the `mips.json` file
705///
706/// # Errors
707///
708/// This function will return `Err` if `json_contents` cannot be parsed
709pub fn populate_mips_instructions(json_contents: &str) -> Result<Vec<Instruction>> {
710    #[derive(Deserialize, Debug)]
711    struct MipsInstruction {
712        pub name: String,
713        pub summary: String,
714        pub asm_templates: Vec<String>,
715    }
716
717    impl From<MipsInstruction> for Instruction {
718        fn from(instr: MipsInstruction) -> Self {
719            Self {
720                name: instr.name.to_ascii_lowercase(),
721                summary: instr.summary,
722                asm_templates: instr.asm_templates,
723                arch: Arch::Mips,
724                forms: Vec::new(),
725                aliases: Vec::new(),
726                url: Some(
727                    "https://www.cs.cornell.edu/courses/cs3410/2008fa/MIPS_Vol2.pdf".to_string(),
728                ),
729            }
730        }
731    }
732
733    let raw_instrs: Vec<MipsInstruction> =
734        serde_json::from_str(json_contents).map_err(|e| anyhow!("Failed to parse JSON: {e}"))?;
735    let instructions: Vec<Instruction> = raw_instrs.into_iter().map(Instruction::from).collect();
736
737    Ok(instructions)
738}
739
740/// Parse the provided HTML contents and return a vector of all the instructions based on that.
741/// <https://www.masswerk.at/6502/6502_instruction_set.html>
742///
743/// # Errors
744///
745/// This function is highly specialized to parse a single file and will panic or return
746/// `Err` for most mal-formed inputs
747///
748/// # Panics
749///
750/// This function is highly specialized to parse a single file and will panic or return
751/// `Err` for most mal-formed/unexpected inputs
752// NOTE: We could use an HTML parsing library like scraper or html5ever, but the input
753// is regular/constrained enough that we can just use some regexes and avoid
754// the extra dependency
755pub fn populate_6502_instructions(html_conts: &str) -> Result<Vec<Instruction>> {
756    let name_regex = Regex::new(r#"<dt id="[A-Z]{3}">(?<name>[A-Z]{3})</dt>$"#).unwrap();
757    let summary_regex = Regex::new(r#"<p aria-label="summary">(?<summary>.+)</p>$"#).unwrap();
758    let mut instructions = Vec::new();
759    let start = {
760        let start_marker = r#"<dl class="opcodes">"#;
761        let section_start = html_conts.find(start_marker).unwrap();
762        section_start + start_marker.len() + 1 // + 1 for '\n'
763    };
764    let mut lines = html_conts[start..].lines().peekable();
765    loop {
766        // opcode id
767        let Some(name_line) = lines.next() else {
768            break;
769        };
770        if name_line.is_empty() {
771            continue;
772        }
773        let name = &name_regex.captures(name_line).unwrap()["name"];
774        assert_eq!(lines.next().unwrap(), "<dd>");
775        // summary
776        let mut summary =
777            summary_regex.captures(lines.next().unwrap()).unwrap()["summary"].to_string();
778        let implementation_notes_marker = r#"<p aria-label="notes on the implementation">"#;
779        let synopsis_marker = r#"<p aria-label="synopsis">"#;
780        if lines
781            .peek()
782            .unwrap()
783            .starts_with(implementation_notes_marker)
784        {
785            summary.push('\n');
786            while !lines.peek().unwrap().starts_with(synopsis_marker) {
787                summary += &lines
788                    .next()
789                    .unwrap()
790                    .replace(r#"<p aria-label="notes on the implementation">"#, "")
791                    .replace("<br />", "")
792                    .replace("</p>", "");
793            }
794        }
795        // synopsis
796        let synopsis_line = lines.next().unwrap();
797        let mut synopsis = String::new();
798        let mut prev_idx = 0;
799        for (i, c) in synopsis_line.chars().enumerate() {
800            match c {
801                '<' => {
802                    if prev_idx != 0 {
803                        let bytes: Vec<u8> = synopsis_line.as_bytes()[prev_idx..i].to_vec();
804                        let decoded = htmlentity::entity::decode(&bytes).to_string().unwrap();
805                        synopsis += &decoded;
806                    }
807                }
808                '>' => prev_idx = i + 1,
809                _ => {}
810            }
811        }
812        // flags
813        assert_eq!(
814            r#"<table aria-label="flags">"#,
815            lines.next().unwrap().trim()
816        );
817        // This is always the same
818        assert_eq!(
819            r"<tr><th>N</th><th>Z</th><th>C</th><th>I</th><th>D</th><th>V</th></tr>",
820            lines.next().unwrap().trim()
821        );
822        let flag_line = lines.next().unwrap().trim();
823        let flags: String = if flag_line.contains("from stack") {
824            "from stack".to_string()
825        } else {
826            flag_line
827                .chars()
828                .skip("<tr><td>".len())
829                .step_by("</td><td>".len() + 1)
830                .take(6) // N, Z, C, I, D, V
831                .collect()
832        };
833        assert!(
834            flags.len() == 6 || flags.eq("from stack"),
835            "name: {name}, flagline: {flag_line}"
836        );
837        assert_eq!("</table>", lines.next().unwrap().trim());
838        // details (table)
839        assert_eq!(
840            r#"<table aria-label="details">"#,
841            lines.next().unwrap().trim()
842        );
843        let mut templates = Vec::new();
844        assert_eq!(
845            r"<tr><th>addressing</th><th>assembler</th><th>opc</th><th>bytes</th><th>cycles</th></tr>",
846            lines.next().unwrap().trim()
847        );
848        loop {
849            let next = lines.next().unwrap().trim();
850            if next.eq("</table>") {
851                break;
852            }
853            let template_marker = "</td><td>";
854            let start_idx = next.find(template_marker).unwrap() + template_marker.len();
855            let end_offset = next[start_idx..].find(template_marker).unwrap();
856            templates.push(next[start_idx..start_idx + end_offset].to_string());
857        }
858        assert_eq!("</dd>", lines.next().unwrap().trim());
859        let combined_summary = format!("{summary}\n{synopsis}\nNZCIDV\n`{flags}`");
860        instructions.push(Instruction {
861            name: name.to_lowercase(),
862            summary: combined_summary,
863            forms: Vec::new(),
864            asm_templates: templates,
865            aliases: Vec::new(),
866            arch: Arch::MOS6502,
867            url: Some(format!(
868                "https://www.masswerk.at/6502/6502_instruction_set.html#{}",
869                name.to_uppercase()
870            )),
871        });
872        if name.eq("TYA") {
873            break;
874        }
875    }
876
877    Ok(instructions)
878}
879
880/// Parse the provided JSON contents and return a vector of all the instructions based on that.
881/// <https://github.com/open-power-sdk/PowerISA/blob/main/ISA.json>
882///
883/// # Errors
884///
885/// This function is highly specialized to parse a single file and will panic or return
886/// `Err` for most mal-formed inputs
887///
888/// # Panics
889///
890/// This function is highly specialized to parse a single file and will panic or return
891/// `Err` for most mal-formed/unexpected inputs
892// NOTE:
893// Raw JSON file pruned via the command:
894// ```
895// jq ".instructions | map({mnemonics: .mnemonics | map(del(.intrinsics)), body})" power-isa.json
896// ```
897pub fn populate_power_isa_instructions(json_conts: &str) -> Result<Vec<Instruction>> {
898    #[allow(non_camel_case_types, clippy::upper_case_acronyms)]
899    #[derive(Deserialize, Debug, Copy, Clone)]
900    enum PowerReleaseRepr {
901        P1,
902        P2,
903        PPC,
904        #[serde(rename = "v2.00")]
905        v200,
906        #[serde(rename = "v2.01")]
907        v201,
908        #[serde(rename = "v2.02")]
909        v202,
910        #[serde(rename = "v2.03")]
911        v203,
912        #[serde(rename = "v2.04")]
913        v204,
914        #[serde(rename = "v2.05")]
915        v205,
916        #[serde(rename = "v2.06")]
917        v206,
918        #[serde(rename = "v2.07")]
919        v207,
920        #[serde(rename = "v3.0")]
921        v30,
922        #[serde(rename = "v3.0B")]
923        v30B,
924        #[serde(rename = "v3.0C")]
925        v30C,
926        #[serde(rename = "v3.1")]
927        v31,
928        #[serde(rename = "v3.1B")]
929        v31B,
930    }
931
932    impl PowerReleaseRepr {
933        fn release_message(self) -> String {
934            String::from(match self {
935                Self::P1 => "Introduced in POWER Architecture",
936                Self::P2 => "Introduced in POWER2 Architecture",
937                Self::PPC => "Introduced in PowerPC Architecture prior to v2.00",
938                Self::v200 => "Introduced in PowerPC Architecture Version 2.00",
939                Self::v201 => "Introduced in PowerPC Architecture Version 2.01",
940                Self::v202 => "Introduced in PowerPC Architecture Version 2.02",
941                Self::v203 => "Introduced in Power ISA Version 2.03",
942                Self::v204 => "Introduced in Power ISA Version 2.04",
943                Self::v205 => "Introduced in Power ISA Version 2.05",
944                Self::v206 => "Introduced in Power ISA Version 2.06",
945                Self::v207 => "Introduced in Power ISA Version 2.07",
946                Self::v30 => "Introduced in Power ISA Version 3.0",
947                Self::v30B => "Introduced in Power ISA Version 3.0B",
948                Self::v30C => "Introduced in Power ISA Version 3.0C",
949                Self::v31 => "Introduced in Power ISA Version 3.1",
950                Self::v31B => "Introduced in Power ISA Version 3.1B",
951            })
952        }
953    }
954    #[allow(dead_code)]
955    #[derive(Deserialize, Debug)]
956    struct PowerConditionRepr {
957        pub field: String,
958        pub value: String,
959    }
960    #[allow(dead_code)]
961    #[derive(Deserialize, Debug)]
962    struct PowerLayoutRepr {
963        pub name: String,
964        pub size: String,
965    }
966    #[allow(dead_code)]
967    #[derive(Deserialize, Debug)]
968    struct PowerMnemonicRepr {
969        pub name: String,
970        pub form: String,
971        pub mnemonic: String,
972        pub operands: Vec<String>,
973        pub conditions: Vec<PowerConditionRepr>,
974        pub layout: Vec<PowerLayoutRepr>,
975        pub release: PowerReleaseRepr,
976    }
977    #[derive(Deserialize, Debug)]
978    struct PowerJsonRepr {
979        pub mnemonics: Vec<PowerMnemonicRepr>,
980        pub body: Vec<String>,
981    }
982
983    impl From<PowerJsonRepr> for Vec<Instruction> {
984        fn from(value: PowerJsonRepr) -> Self {
985            let mut instructions = Self::new();
986            for op in value.mnemonics {
987                let name = op.mnemonic.trim();
988                let mut instruction = Instruction {
989                    arch: Arch::PowerISA,
990                    name: name.to_string(),
991                    ..Default::default()
992                };
993                instruction.summary = {
994                    let operands = op.operands.iter().fold(String::new(), |accum, x| {
995                        format!("{} + `{x}`", if accum.is_empty() { "" } else { "\n" })
996                    });
997                    let description = value.body.join("\n");
998
999                    format!(
1000                        "\n{} ({})\n\n{operands}\n{description}",
1001                        op.name,
1002                        op.release.release_message(),
1003                    )
1004                };
1005                instructions.push(instruction);
1006            }
1007
1008            instructions
1009        }
1010    }
1011
1012    let json_instrs: Vec<PowerJsonRepr> = serde_json::from_str(json_conts)?;
1013    let mut instructions = Vec::new();
1014    for instr in json_instrs {
1015        instructions.append(&mut instr.into());
1016    }
1017
1018    Ok(instructions)
1019}
1020
1021/// Parse the provided XML contents and return a vector of all the instructions based on that.
1022/// If parsing fails, the appropriate error will be returned instead.
1023///
1024/// Current function assumes that the XML file is already read and that it's been given a reference
1025/// to its contents (`&str`).
1026///
1027/// # Errors
1028///
1029/// This function is highly specialized to parse a handful of files and will panic or return
1030/// `Err` for most mal-formed inputs
1031///
1032/// # Panics
1033///
1034/// This function is highly specialized to parse a handful of files and will panic or return
1035/// `Err` for most mal-formed/unexpected inputs
1036pub fn populate_instructions(xml_contents: &str) -> Result<Vec<Instruction>> {
1037    // initialise the instruction set
1038    let mut instructions_map = HashMap::<String, Instruction>::new();
1039
1040    // iterate through the XML
1041    let mut reader = Reader::from_str(xml_contents);
1042
1043    // ref to the instruction that's currently under construction
1044    let mut curr_instruction = Instruction::default();
1045    let mut curr_instruction_form = InstructionForm::default();
1046    let mut arch: Arch = Arch::None;
1047
1048    loop {
1049        match reader.read_event() {
1050            // start event
1051            Ok(Event::Start(ref e)) => {
1052                match e.name() {
1053                    QName(b"InstructionSet") => {
1054                        for attr in e.attributes() {
1055                            let Attribute { key, value } = attr.unwrap();
1056                            if b"name" == key.into_inner() {
1057                                arch = Arch::from_str(ustr::get_str(&value)).unwrap_or_else(|e| {
1058                                    panic!("Failed parse Arch {} -- {e}", ustr::get_str(&value))
1059                                });
1060                            } else {
1061                                panic!("Failed to parse architecture name -- no name value");
1062                            }
1063                        }
1064                    }
1065                    QName(b"Instruction") => {
1066                        // start of a new instruction
1067                        curr_instruction = Instruction::default();
1068                        curr_instruction.arch = arch;
1069
1070                        // iterate over the attributes
1071                        for attr in e.attributes() {
1072                            let Attribute { key, value } = attr.unwrap();
1073                            match ustr::get_str(key.into_inner()) {
1074                                "name" => {
1075                                    let name = ustr::get_str(&value);
1076                                    curr_instruction.name = name.to_ascii_lowercase();
1077                                }
1078                                "summary" => {
1079                                    ustr::get_str(&value).clone_into(&mut curr_instruction.summary);
1080                                }
1081                                _ => {}
1082                            }
1083                        }
1084                    }
1085                    QName(b"InstructionForm") => {
1086                        // Read the attributes
1087                        //
1088                        // <xs:attribute name="gas-name" type="xs:string" use="required" />
1089                        // <xs:attribute name="go-name" type="xs:string" />
1090                        // <xs:attribute name="mmx-mode" type="MMXMode" />
1091                        // <xs:attribute name="xmm-mode" type="XMMMode" />
1092                        // <xs:attribute name="cancelling-inputs" type="xs:boolean" />
1093                        // <xs:attribute name="nacl-version" type="NaClVersion" />
1094                        // <xs:attribute name="nacl-zero-extends-outputs" type="xs:boolean" />
1095
1096                        // new instruction form
1097                        curr_instruction_form = InstructionForm::default();
1098
1099                        // iterate over the attributes
1100                        for attr in e.attributes() {
1101                            let Attribute { key, value } = attr.unwrap();
1102                            match ustr::get_str(key.into_inner()) {
1103                                "gas-name" => {
1104                                    curr_instruction_form.gas_name =
1105                                        Some(ustr::get_str(&value).to_owned());
1106                                }
1107                                "go-name" => {
1108                                    curr_instruction_form.go_name =
1109                                        Some(ustr::get_str(&value).to_owned());
1110                                }
1111                                "mmx-mode" => {
1112                                    let value_ = value.as_ref();
1113                                    curr_instruction_form.mmx_mode =
1114                                        Some(MMXMode::from_str(ustr::get_str(value_))?);
1115                                }
1116                                "xmm-mode" => {
1117                                    let value_ = value.as_ref();
1118                                    curr_instruction_form.xmm_mode =
1119                                        Some(XMMMode::from_str(ustr::get_str(value_))?);
1120                                }
1121                                "cancelling-inputs" => match ustr::get_str(&value) {
1122                                    "true" => curr_instruction_form.cancelling_inputs = Some(true),
1123                                    "false" => {
1124                                        curr_instruction_form.cancelling_inputs = Some(false);
1125                                    }
1126                                    val => {
1127                                        return Err(anyhow!(
1128                                            "Unknown value '{val}' for XML attribute cancelling inputs"
1129                                        ));
1130                                    }
1131                                },
1132                                "nacl-version" => {
1133                                    curr_instruction_form.nacl_version =
1134                                        value.as_ref().first().copied();
1135                                }
1136                                "nacl-zero-extends-outputs" => match ustr::get_str(&value) {
1137                                    "true" => {
1138                                        curr_instruction_form.nacl_zero_extends_outputs =
1139                                            Some(true);
1140                                    }
1141                                    "false" => {
1142                                        curr_instruction_form.nacl_zero_extends_outputs =
1143                                            Some(false);
1144                                    }
1145                                    val => {
1146                                        return Err(anyhow!(
1147                                            "Unknown value '{val}' for XML attribute nacl-zero-extends-outputs",
1148                                        ));
1149                                    }
1150                                },
1151                                "z80name" => {
1152                                    curr_instruction_form.z80_name =
1153                                        Some(ustr::get_str(&value).to_owned());
1154                                }
1155                                "form" => {
1156                                    let value_ = ustr::get_str(&value);
1157                                    curr_instruction_form.urls.push(format!(
1158                                        "https://www.zilog.com/docs/z80/z80cpu_um.pdf#{}",
1159                                        encode_www_form_urlencoded(value_)
1160                                    ));
1161                                    curr_instruction_form.z80_form = Some(value_.to_string());
1162                                }
1163                                _ => {}
1164                            }
1165                        }
1166                    }
1167                    // TODO
1168                    QName(b"Encoding") => {
1169                        for attr in e.attributes() {
1170                            let Attribute { key, value } = attr.unwrap();
1171                            if key.into_inner() == b"byte" {
1172                                let disp_code = ustr::get_str(&value);
1173                                if let Some(ref mut opcodes) = curr_instruction_form.z80_opcode {
1174                                    opcodes.push_str(disp_code);
1175                                } else {
1176                                    curr_instruction_form.z80_opcode = Some(disp_code.to_owned());
1177                                }
1178                            }
1179                        }
1180                    }
1181                    _ => {} // unknown event
1182                }
1183            }
1184            Ok(Event::Empty(ref e)) => {
1185                match e.name() {
1186                    QName(b"ISA") => {
1187                        for attr in e.attributes() {
1188                            let Attribute { key, value } = attr.unwrap();
1189                            if key.into_inner() == b"id" {
1190                                curr_instruction_form.isa = Some(
1191                                    ISA::from_str(ustr::get_str(value.as_ref())).unwrap_or_else(
1192                                        |_| {
1193                                            panic!(
1194                                                "Unexpected ISA variant {}",
1195                                                ustr::get_str(&value)
1196                                            )
1197                                        },
1198                                    ),
1199                                );
1200                            }
1201                        }
1202                    }
1203                    QName(b"Operand") => {
1204                        let mut type_ = OperandType::k; // dummy initialisation
1205                        let mut extended_size = None;
1206                        let mut input = None;
1207                        let mut output = None;
1208
1209                        for attr in e.attributes() {
1210                            let Attribute { key, value } = attr.unwrap();
1211                            match key.into_inner() {
1212                                b"type" => {
1213                                    type_ = match OperandType::from_str(ustr::get_str(&value)) {
1214                                        Ok(op_type) => op_type,
1215                                        Err(_) => {
1216                                            return Err(anyhow!(
1217                                                "Unknown value for operand type -- Variant: {}",
1218                                                ustr::get_str(&value)
1219                                            ));
1220                                        }
1221                                    }
1222                                }
1223                                b"input" => match value.as_ref() {
1224                                    b"true" => input = Some(true),
1225                                    b"false" => input = Some(false),
1226                                    _ => return Err(anyhow!("Unknown value for operand type")),
1227                                },
1228                                b"output" => match value.as_ref() {
1229                                    b"true" => output = Some(true),
1230                                    b"false" => output = Some(false),
1231                                    _ => return Err(anyhow!("Unknown value for operand type")),
1232                                },
1233                                b"extended-size" => {
1234                                    extended_size =
1235                                        Some(ustr::get_str(value.as_ref()).parse::<usize>()?);
1236                                }
1237                                _ => {} // unknown event
1238                            }
1239                        }
1240
1241                        curr_instruction_form.operands.push(Operand {
1242                            type_,
1243                            input,
1244                            output,
1245                            extended_size,
1246                        });
1247                    }
1248                    QName(b"TimingZ80") => {
1249                        for attr in e.attributes() {
1250                            let Attribute { key, value } = attr.unwrap();
1251                            if key.into_inner() == b"value" {
1252                                let z80 = match Z80TimingInfo::from_str(ustr::get_str(&value)) {
1253                                    Ok(timing) => timing,
1254                                    Err(e) => return Err(anyhow!(e)),
1255                                };
1256                                if let Some(ref mut timing_entry) = curr_instruction_form.z80_timing
1257                                {
1258                                    timing_entry.z80 = z80;
1259                                } else {
1260                                    curr_instruction_form.z80_timing = Some(Z80Timing {
1261                                        z80,
1262                                        ..Default::default()
1263                                    });
1264                                }
1265                            }
1266                        }
1267                    }
1268                    QName(b"TimingZ80M1") => {
1269                        for attr in e.attributes() {
1270                            let Attribute { key, value } = attr.unwrap();
1271                            if key.into_inner() == b"value" {
1272                                let z80_plus_m1 =
1273                                    match Z80TimingInfo::from_str(ustr::get_str(&value)) {
1274                                        Ok(timing) => timing,
1275                                        Err(e) => return Err(anyhow!(e)),
1276                                    };
1277                                if let Some(ref mut timing_entry) = curr_instruction_form.z80_timing
1278                                {
1279                                    timing_entry.z80_plus_m1 = z80_plus_m1;
1280                                } else {
1281                                    curr_instruction_form.z80_timing = Some(Z80Timing {
1282                                        z80_plus_m1,
1283                                        ..Default::default()
1284                                    });
1285                                }
1286                            }
1287                        }
1288                    }
1289                    QName(b"TimingR800") => {
1290                        for attr in e.attributes() {
1291                            let Attribute { key, value } = attr.unwrap();
1292                            if key.into_inner() == b"value" {
1293                                let r800 = match Z80TimingInfo::from_str(ustr::get_str(&value)) {
1294                                    Ok(timing) => timing,
1295                                    Err(e) => return Err(anyhow!(e)),
1296                                };
1297                                if let Some(ref mut timing_entry) = curr_instruction_form.z80_timing
1298                                {
1299                                    timing_entry.r800 = r800;
1300                                } else {
1301                                    curr_instruction_form.z80_timing = Some(Z80Timing {
1302                                        r800,
1303                                        ..Default::default()
1304                                    });
1305                                }
1306                            }
1307                        }
1308                    }
1309                    QName(b"TimingR800Wait") => {
1310                        for attr in e.attributes() {
1311                            let Attribute { key, value } = attr.unwrap();
1312                            if key.into_inner() == b"value" {
1313                                let r800_plus_wait =
1314                                    match Z80TimingInfo::from_str(ustr::get_str(&value)) {
1315                                        Ok(timing) => timing,
1316                                        Err(e) => return Err(anyhow!(e)),
1317                                    };
1318                                if let Some(ref mut timing_entry) = curr_instruction_form.z80_timing
1319                                {
1320                                    timing_entry.r800_plus_wait = r800_plus_wait;
1321                                } else {
1322                                    curr_instruction_form.z80_timing = Some(Z80Timing {
1323                                        r800_plus_wait,
1324                                        ..Default::default()
1325                                    });
1326                                }
1327                            }
1328                        }
1329                    }
1330                    _ => {} // unknown event
1331                }
1332            }
1333            // end event
1334            Ok(Event::End(ref e)) => {
1335                match e.name() {
1336                    QName(b"Instruction") => {
1337                        // finish instruction
1338                        assert!(curr_instruction.arch != Arch::None);
1339                        instructions_map
1340                            .insert(curr_instruction.name.clone(), curr_instruction.clone());
1341                    }
1342                    QName(b"InstructionForm") => {
1343                        curr_instruction.push_form(curr_instruction_form.clone());
1344                    }
1345                    _ => {} // unknown event
1346                }
1347            }
1348            Ok(Event::Eof) => break,
1349            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
1350            _ => {} // rest of events that we don't consider
1351        }
1352    }
1353
1354    if matches!(arch, Arch::X86 | Arch::X86_64) {
1355        let x86_online_docs = get_x86_docs_url();
1356        let body = get_docs_body(&x86_online_docs).unwrap_or_default();
1357        let body_it = body.split("<td>").skip(1).step_by(2);
1358
1359        // Parse this x86 page, grab the contents of the table + the URLs they are referring to
1360        // Regex to match:
1361        // <a href="./VSCATTERPF1DPS:VSCATTERPF1QPS:VSCATTERPF1DPD:VSCATTERPF1QPD.html">VSCATTERPF1QPS</a></td>
1362        //
1363        // let re = Regex::new(r"<a href=\"./(.*)">(.*)</a></td>")?;
1364        // let re = Regex::new(r#"<a href="\./(.*?\.html)">(.*?)</a>.*</td>"#)?;
1365        // let re = Regex::new(r"<a href='\/(.*?)'>(.*?)<\/a>.*<\/td>")?;
1366        let re = Regex::new(r"<a href='\/x86\/(.*?)'>(.*?)<\/a>.*<\/td>")?;
1367        for line in body_it {
1368            // take it step by step.. match a small portion of the line first...
1369            let caps = re.captures(line).unwrap();
1370            let url_suffix = caps.get(1).map_or("", |m| m.as_str());
1371            let instruction_name = caps.get(2).map_or("", |m| m.as_str());
1372
1373            // add URL to the corresponding instruction
1374            if let Some(instruction) = instructions_map.get_mut(instruction_name) {
1375                instruction.url = Some(x86_online_docs.clone() + url_suffix);
1376            }
1377        }
1378    }
1379
1380    Ok(instructions_map.into_values().collect())
1381}
1382
1383pub fn populate_name_to_instruction_map(
1384    arch: Arch,
1385    instructions: &Vec<Instruction>,
1386    names_to_instructions: &mut NameToInstructionMap,
1387) {
1388    for instruction in instructions {
1389        names_to_instructions.insert((arch, instruction.name.clone()), instruction.clone());
1390        // Inserts instruction form names in addition to the instruction's "main"
1391        // name
1392        for name in &instruction.get_associated_names() {
1393            names_to_instructions
1394                .entry((arch, (*name).to_string()))
1395                .or_insert_with(|| instruction.clone());
1396        }
1397    }
1398}
1399
1400fn process_sreg_value(
1401    e: &BytesStart,
1402    curr_instruction_form: &mut InstructionForm,
1403    field_setter: impl FnOnce(&mut AvrStatusRegister, char),
1404) {
1405    for attr in e.attributes() {
1406        let Attribute { key, value } = attr.unwrap();
1407        if key.into_inner() == b"value" {
1408            let val = ustr::get_str(&value);
1409            let status = if val.eq("–") {
1410                '-'
1411            } else {
1412                ustr::get_str(&value)
1413                    .chars()
1414                    .next()
1415                    .expect("Empty status register value")
1416            };
1417            if let Some(ref mut sreg_entry) = curr_instruction_form.avr_status_register {
1418                field_setter(sreg_entry, status);
1419            } else {
1420                let mut sreg = AvrStatusRegister::default();
1421                field_setter(&mut sreg, status);
1422                curr_instruction_form.avr_status_register = Some(sreg);
1423            }
1424            break;
1425        }
1426    }
1427}
1428
1429fn process_clock_value(
1430    e: &BytesStart,
1431    curr_instruction_form: &mut InstructionForm,
1432    field_setter: impl FnOnce(&mut AvrTiming, Option<String>),
1433) {
1434    for attr in e.attributes() {
1435        let Attribute { key, value } = attr.unwrap();
1436        if key.into_inner() == b"value" {
1437            let cycles = Some(ustr::get_str(&value).to_string());
1438            if let Some(ref mut timing_entry) = curr_instruction_form.avr_timing {
1439                field_setter(timing_entry, cycles);
1440            } else {
1441                let mut timing = AvrTiming::default();
1442                field_setter(&mut timing, cycles);
1443                curr_instruction_form.avr_timing = Some(timing);
1444            }
1445            break;
1446        }
1447    }
1448}
1449
1450/// Parse the provided XML contents and return a vector of all the instructions based on that.
1451/// If parsing fails, the appropriate error will be returned instead.
1452///
1453/// Current function assumes that the XML file is already read and that it's been given a reference
1454/// to its contents (`&str`).
1455///
1456/// # Errors
1457///
1458/// This function is highly specialized to parse a handful of files and will panic or return
1459/// `Err` for most mal-formed inputs
1460///
1461/// # Panics
1462///
1463/// This function is highly specialized to parse a handful of files and will panic or return
1464/// `Err` for most mal-formed/unexpected inputs
1465pub fn populate_avr_instructions(xml_contents: &str) -> Result<Vec<Instruction>> {
1466    // initialise the instruction set
1467    let mut instructions_map = HashMap::<String, Instruction>::new();
1468
1469    // iterate through the XML
1470    let mut reader = Reader::from_str(xml_contents);
1471
1472    // ref to the instruction that's currently under construction
1473    let mut curr_instruction = Instruction::default();
1474    let mut curr_instruction_form = InstructionForm::default();
1475    let mut arch: Arch = Arch::None;
1476    let mut curr_version: Option<String> = None;
1477
1478    loop {
1479        match reader.read_event() {
1480            // start event
1481            Ok(Event::Start(ref e)) => {
1482                match e.name() {
1483                    QName(b"InstructionSet") => {
1484                        for attr in e.attributes() {
1485                            let Attribute { key, value } = attr.unwrap();
1486                            if b"name" == key.into_inner() {
1487                                arch = Arch::from_str(ustr::get_str(&value)).unwrap_or_else(|e| {
1488                                    panic!("Failed parse Arch {} -- {e}", ustr::get_str(&value))
1489                                });
1490                                assert!(arch == Arch::Avr);
1491                            } else {
1492                                panic!("Failed parse Arch -- no name value");
1493                            }
1494                        }
1495                    }
1496                    QName(b"Instruction") => {
1497                        // start of a new instruction
1498                        curr_instruction = Instruction::default();
1499                        curr_instruction.arch = arch;
1500
1501                        for attr in e.attributes() {
1502                            let Attribute { key, value } = attr.unwrap();
1503                            match ustr::get_str(key.into_inner()) {
1504                                "name" => {
1505                                    let name = ustr::get_str(&value);
1506                                    curr_instruction.name = name.to_ascii_lowercase();
1507                                }
1508                                "summary" => {
1509                                    ustr::get_str(&value).clone_into(&mut curr_instruction.summary);
1510                                }
1511                                _ => {}
1512                            }
1513                        }
1514                    }
1515                    // Versions are defined a per-instruction form basis
1516                    QName(b"Version") => {
1517                        for attr in e.attributes() {
1518                            let Attribute { key, value } = attr.unwrap();
1519                            if "value" == ustr::get_str(key.into_inner()) {
1520                                curr_version = Some(ustr::get_str(&value).to_string());
1521                            }
1522                        }
1523                    }
1524                    QName(b"InstructionForm") => {
1525                        assert!(curr_version.is_some());
1526                        // new instruction form
1527                        curr_instruction_form = InstructionForm::default();
1528                        curr_instruction_form.avr_version.clone_from(&curr_version);
1529
1530                        // iterate over the attributes
1531                        for attr in e.attributes() {
1532                            let Attribute { key, value } = attr.unwrap();
1533                            match ustr::get_str(key.into_inner()) {
1534                                "mnemonic" => {
1535                                    curr_instruction_form.avr_mneumonic =
1536                                        Some(ustr::get_str(&value).to_owned());
1537                                }
1538                                "summary" => {
1539                                    curr_instruction_form.avr_summary =
1540                                        Some(ustr::get_str(&value).to_owned());
1541                                }
1542                                _ => {}
1543                            }
1544                        }
1545                    }
1546                    // NOTE: Intentionally leaving out encoding nibbles unless that's desired...
1547                    // QName(b"Encoding") => {}
1548                    _ => {} // unknown event
1549                }
1550            }
1551            Ok(Event::Empty(ref e)) => {
1552                match e.name() {
1553                    QName(b"Operand") => {
1554                        for attr in e.attributes() {
1555                            let Attribute { key, value } = attr.unwrap();
1556                            if key.into_inner() == b"type" {
1557                                let val = ustr::get_str(&value);
1558                                for oper in val.split(',') {
1559                                    if oper.is_empty() {
1560                                        continue;
1561                                    }
1562                                    let Ok(type_) = OperandType::from_str(oper) else {
1563                                        return Err(anyhow!(
1564                                            "Unknown value for operand type -- Variant: {}",
1565                                            ustr::get_str(&value)
1566                                        ));
1567                                    };
1568                                    curr_instruction_form.operands.push(Operand {
1569                                        type_,
1570                                        input: None,
1571                                        output: None,
1572                                        extended_size: None,
1573                                    });
1574                                }
1575                            }
1576                        }
1577                    }
1578                    // Status register values
1579                    QName(b"I") => {
1580                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.i = val);
1581                    }
1582                    QName(b"T") => {
1583                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.t = val);
1584                    }
1585                    QName(b"H") => {
1586                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.h = val);
1587                    }
1588                    QName(b"S") => {
1589                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.s = val);
1590                    }
1591                    QName(b"V") => {
1592                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.v = val);
1593                    }
1594                    QName(b"Z") => {
1595                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.z = val);
1596                    }
1597                    QName(b"C") => {
1598                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.c = val);
1599                    }
1600                    QName(b"N") => {
1601                        process_sreg_value(e, &mut curr_instruction_form, |sreg, val| sreg.n = val);
1602                    }
1603                    // Clocks
1604                    QName(b"AVRe") => {
1605                        process_clock_value(e, &mut curr_instruction_form, |timing, val| {
1606                            timing.avre = val;
1607                        });
1608                    }
1609                    QName(b"AVRxm") => {
1610                        process_clock_value(e, &mut curr_instruction_form, |timing, val| {
1611                            timing.avrxm = val;
1612                        });
1613                    }
1614                    QName(b"AVRxt") => {
1615                        process_clock_value(e, &mut curr_instruction_form, |timing, val| {
1616                            timing.avrxt = val;
1617                        });
1618                    }
1619                    QName(b"AVRrc") => {
1620                        process_clock_value(e, &mut curr_instruction_form, |timing, val| {
1621                            timing.avrrc = val;
1622                        });
1623                    }
1624                    _ => {} // unknown event
1625                }
1626            }
1627            // end event
1628            Ok(Event::End(ref e)) => {
1629                match e.name() {
1630                    QName(b"Instruction") => {
1631                        // finish instruction
1632                        assert!(curr_instruction.arch != Arch::None);
1633                        instructions_map
1634                            .insert(curr_instruction.name.clone(), curr_instruction.clone());
1635                        curr_version = None;
1636                    }
1637                    QName(b"InstructionForm") => {
1638                        curr_instruction.push_form(curr_instruction_form.clone());
1639                    }
1640                    _ => {} // unknown event
1641                }
1642            }
1643            Ok(Event::Eof) => break,
1644            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
1645            _ => {} // rest of events that we don't consider
1646        }
1647    }
1648
1649    Ok(instructions_map.into_values().collect())
1650}
1651
1652/// Parse the provided XML contents and return a vector of all the registers based on that.
1653/// If parsing fails, the appropriate error will be returned instead.
1654///
1655/// Current function assumes that the XML file is already read and that it's been given a reference
1656/// to its contents (`&str`).
1657///
1658/// # Errors
1659///
1660/// This function is highly specialized to parse a handful of files and will panic or return
1661/// `Err` for most mal-formed/unexpected inputs
1662///
1663/// # Panics
1664///
1665/// This function is highly specialized to parse a handful of files and will panic or return
1666/// `Err` for most mal-formed/unexpected inputs
1667pub fn populate_registers(xml_contents: &str) -> Result<Vec<Register>> {
1668    let mut registers_map = HashMap::<String, Register>::new();
1669
1670    // iterate through the XML
1671    let mut reader = Reader::from_str(xml_contents);
1672
1673    // ref to the register that's currently under construction
1674    let mut curr_register = Register::default();
1675    let mut curr_bit_flag = RegisterBitInfo::default();
1676    let mut arch: Arch = Arch::None;
1677
1678    loop {
1679        match reader.read_event() {
1680            // start event
1681            Ok(Event::Start(ref e)) => {
1682                match e.name() {
1683                    QName(b"InstructionSet") => {
1684                        for attr in e.attributes() {
1685                            let Attribute { key, value } = attr.unwrap();
1686                            if b"name" == key.into_inner() {
1687                                arch = Arch::from_str(ustr::get_str(&value)).unwrap_or_else(|e| {
1688                                    panic!(
1689                                        "Unexpected Arch variant {} -- {e}",
1690                                        ustr::get_str(&value)
1691                                    )
1692                                });
1693                            }
1694                        }
1695                    }
1696                    QName(b"Register") => {
1697                        // start of a new register
1698                        curr_register = Register::default();
1699                        curr_register.arch = arch;
1700
1701                        // iterate over the attributes
1702                        for attr in e.attributes() {
1703                            let Attribute { key, value } = attr.unwrap();
1704                            match key.into_inner() {
1705                                b"name" => {
1706                                    let name_ = String::from(ustr::get_str(&value));
1707                                    curr_register.name = name_.to_ascii_lowercase();
1708                                }
1709                                b"description" => {
1710                                    curr_register.description =
1711                                        Some(String::from(ustr::get_str(&value)));
1712                                }
1713                                b"type" => {
1714                                    curr_register.reg_type =
1715                                        RegisterType::from_str(ustr::get_str(&value))
1716                                            .map_or(None, |reg| Some(reg));
1717                                }
1718                                b"width" => {
1719                                    curr_register.width =
1720                                        RegisterWidth::from_str(ustr::get_str(&value))
1721                                            .map_or(None, |width| Some(width));
1722                                }
1723                                _ => {}
1724                            }
1725                        }
1726                    }
1727                    // Actual flag bit info
1728                    QName(b"Flag") => {
1729                        curr_bit_flag = RegisterBitInfo::default();
1730
1731                        for attr in e.attributes() {
1732                            let Attribute { key, value } = attr.unwrap();
1733                            match key.into_inner() {
1734                                b"bit" => {
1735                                    curr_bit_flag.bit =
1736                                        ustr::get_str(&value).parse::<u32>().unwrap();
1737                                }
1738                                b"label" => {
1739                                    curr_bit_flag.label = String::from(ustr::get_str(&value));
1740                                }
1741                                b"description" => {
1742                                    curr_bit_flag.description = String::from(ustr::get_str(&value));
1743                                }
1744                                b"pae" => {
1745                                    curr_bit_flag.pae = String::from(ustr::get_str(&value));
1746                                }
1747                                b"longmode" => {
1748                                    curr_bit_flag.long_mode = String::from(ustr::get_str(&value));
1749                                }
1750                                _ => {}
1751                            }
1752                        }
1753                    }
1754                    _ => {} // unknown event
1755                }
1756            }
1757            // end event
1758            Ok(Event::End(ref e)) => {
1759                match e.name() {
1760                    QName(b"Register") => {
1761                        // finish register
1762                        assert!(curr_register.arch != Arch::None);
1763                        registers_map.insert(curr_register.name.clone(), curr_register.clone());
1764                    }
1765                    QName(b"Flag") => {
1766                        curr_register.push_flag(curr_bit_flag.clone());
1767                    }
1768                    _ => {} // unknown event
1769                }
1770            }
1771            Ok(Event::Eof) => break,
1772            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
1773            _ => {} // rest of events that we don't consider
1774        }
1775    }
1776
1777    // TODO: Add to URL fields here for x86/x86-64?
1778    // https://wiki.osdev.org/CPU_Registers_x86 and https://wiki.osdev.org/CPU_Registers_x86-64
1779    // are less straightforward compared to the instruction set site
1780
1781    Ok(registers_map.into_values().collect())
1782}
1783
1784pub fn populate_name_to_register_map(
1785    arch: Arch,
1786    registers: &Vec<Register>,
1787    names_to_registers: &mut NameToRegisterMap,
1788) {
1789    for register in registers {
1790        for name in &register.get_associated_names() {
1791            names_to_registers.insert((arch, (*name).to_string()), register.clone());
1792        }
1793    }
1794}
1795
1796/// Parse the provided XML contents and return a vector of all the directives based on that.
1797/// If parsing fails, the appropriate error will be returned instead.
1798///
1799/// Current function assumes that the XML file is already read and that it's been given a reference
1800/// to its contents (`&str`).
1801///
1802/// # Errors
1803///
1804/// This function is highly specialized to parse a handful of files and will panic or return
1805/// `Err` for most mal-formed/unexpected inputs
1806///
1807/// # Panics
1808///
1809/// This function is highly specialized to parse a handful of files and will panic or return
1810/// `Err` for most mal-formed/unexpected inputs
1811pub fn populate_masm_nasm_fasm_mars_directives(xml_contents: &str) -> Result<Vec<Directive>> {
1812    let mut directives_map = HashMap::<String, Directive>::new();
1813
1814    // iterate through the XML
1815    let mut reader = Reader::from_str(xml_contents);
1816
1817    // ref to the assembler directive that's currently under construction
1818    let mut curr_directive = Directive::default();
1819    let mut in_desc = false;
1820
1821    loop {
1822        match reader.read_event() {
1823            // start event
1824            Ok(Event::Start(ref e)) => {
1825                match e.name() {
1826                    QName(b"directive") => {
1827                        // start of a new directive
1828                        curr_directive = Directive::default();
1829
1830                        // iterate over the attributes
1831                        for attr in e.attributes() {
1832                            let Attribute { key, value } = attr.unwrap();
1833                            match key.into_inner() {
1834                                b"name" => {
1835                                    let name = ustr::get_str(&value);
1836                                    curr_directive.name = name.to_ascii_lowercase();
1837                                }
1838                                b"tool" => {
1839                                    let assembler = Assembler::from_str(ustr::get_str(&value))?;
1840                                    curr_directive.assembler = assembler;
1841                                }
1842                                _ => {}
1843                            }
1844                        }
1845                    }
1846                    QName(b"description") => {
1847                        in_desc = true;
1848                    }
1849                    _ => {} // unknown event
1850                }
1851            }
1852            Ok(Event::Text(ref txt)) => {
1853                if in_desc {
1854                    ustr::get_str(txt)
1855                        .trim_ascii()
1856                        .clone_into(&mut curr_directive.description);
1857                }
1858            }
1859            // end event
1860            Ok(Event::End(ref e)) => {
1861                if QName(b"directive") == e.name() {
1862                    directives_map.insert(curr_directive.name.clone(), curr_directive.clone());
1863                } else if QName(b"description") == e.name() {
1864                    in_desc = false;
1865                }
1866            }
1867            Ok(Event::Eof) => break,
1868            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
1869            _ => {} // rest of events that we don't consider
1870        }
1871    }
1872
1873    // Since directive entries have their assembler labeled on a per-instance basis,
1874    // we check to make sure all of them have been assigned correctly
1875    for directive in directives_map.values() {
1876        assert_ne!(directive.assembler, Assembler::None);
1877    }
1878
1879    Ok(directives_map.into_values().collect())
1880}
1881
1882/// Parse the provided XML contents and return a vector of all the directives based on that.
1883/// If parsing fails, the appropriate error will be returned instead.
1884///
1885/// Current function assumes that the XML file is already read and that it's been given a reference
1886/// to its contents (`&str`).
1887///
1888/// # Errors
1889///
1890/// This function is highly specialized to parse a handful of files and will panic or return
1891/// `Err` for most mal-formed/unexpected inputs
1892///
1893/// # Panics
1894///
1895/// This function is highly specialized to parse a handful of files and will panic or return
1896/// `Err` for most mal-formed/unexpected inputs
1897pub fn populate_gas_directives(xml_contents: &str) -> Result<Vec<Directive>> {
1898    let mut directives_map = HashMap::<String, Directive>::new();
1899
1900    // iterate through the XML
1901    let mut reader = Reader::from_str(xml_contents);
1902
1903    // ref to the assembler directive that's currently under construction
1904    let mut curr_directive = Directive::default();
1905    let mut assembler = Assembler::None;
1906
1907    loop {
1908        match reader.read_event() {
1909            // start event
1910            Ok(Event::Start(ref e)) => {
1911                match e.name() {
1912                    QName(b"Assembler") => {
1913                        for attr in e.attributes() {
1914                            let Attribute { key, value } = attr.unwrap();
1915                            if b"name" == key.into_inner() {
1916                                assembler = Assembler::from_str(ustr::get_str(&value)).unwrap();
1917                            }
1918                        }
1919                    }
1920                    QName(b"Directive") => {
1921                        // start of a new directive
1922                        curr_directive = Directive::default();
1923                        curr_directive.assembler = assembler;
1924
1925                        // iterate over the attributes
1926                        for attr in e.attributes() {
1927                            let Attribute { key, value } = attr.unwrap();
1928                            match key.into_inner() {
1929                                b"name" => {
1930                                    let name = ustr::get_str(&value);
1931                                    curr_directive.name = name.to_ascii_lowercase();
1932                                }
1933                                b"md_description" => {
1934                                    let description = ustr::get_str(&value);
1935                                    curr_directive.description =
1936                                        unescape(description).unwrap().to_string();
1937                                }
1938                                b"deprecated" => {
1939                                    curr_directive.deprecated =
1940                                        FromStr::from_str(ustr::get_str(&value)).unwrap();
1941                                }
1942                                b"url_fragment" => {
1943                                    curr_directive.url = Some(format!(
1944                                        "https://sourceware.org/binutils/docs-2.41/as/{}.html",
1945                                        ustr::get_str(&value)
1946                                    ));
1947                                }
1948                                _ => {}
1949                            }
1950                        }
1951                    }
1952                    QName(b"Signature") => {
1953                        for attr in e.attributes() {
1954                            let Attribute { key, value } = attr.unwrap();
1955                            if b"sig" == key.into_inner() {
1956                                let sig = ustr::get_str(&value);
1957                                curr_directive
1958                                    .signatures
1959                                    .push(unescape(sig).unwrap().to_string());
1960                            }
1961                        }
1962                    }
1963                    _ => {} // unknown event
1964                }
1965            }
1966            // end event
1967            Ok(Event::End(ref e)) => {
1968                if QName(b"Directive") == e.name() {
1969                    // finish directive
1970                    directives_map.insert(curr_directive.name.clone(), curr_directive.clone());
1971                }
1972            }
1973            Ok(Event::Eof) => break,
1974            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
1975            _ => {} // rest of events that we don't consider
1976        }
1977    }
1978
1979    // Since directive entries have their assembler labeled on a per-instance basis,
1980    // we check to make sure all of them have been assigned correctly
1981    for directive in directives_map.values() {
1982        assert_ne!(directive.assembler, Assembler::None);
1983    }
1984
1985    Ok(directives_map.into_values().collect())
1986}
1987
1988/// Parse the provided XML contents and return a vector of all the directives based on that.
1989/// If parsing fails, the appropriate error will be returned instead.
1990///
1991/// Current function assumes that the XML file is already read and that it's been given a reference
1992/// to its contents (`&str`).
1993///
1994/// # Errors
1995///
1996/// This function is highly specialized to parse a handful of files and will panic or return
1997/// `Err` for most mal-formed/unexpected inputs
1998///
1999/// # Panics
2000///
2001/// This function is highly specialized to parse a handful of files and will panic or return
2002/// `Err` for most mal-formed/unexpected inputs
2003pub fn populate_avr_directives(xml_contents: &str) -> Result<Vec<Directive>> {
2004    let mut directives_map = HashMap::<String, Directive>::new();
2005
2006    // iterate through the XML
2007    let mut reader = Reader::from_str(xml_contents);
2008
2009    // ref to the assembler directive that's currently under construction
2010    let mut curr_directive = Directive::default();
2011    let mut assembler = Assembler::None;
2012
2013    loop {
2014        match reader.read_event() {
2015            // start event
2016            Ok(Event::Start(ref e)) => {
2017                match e.name() {
2018                    QName(b"Assembler") => {
2019                        for attr in e.attributes() {
2020                            let Attribute { key, value } = attr.unwrap();
2021                            if b"name" == key.into_inner() {
2022                                assembler = Assembler::from_str(ustr::get_str(&value)).unwrap();
2023                            }
2024                        }
2025                    }
2026                    QName(b"Directive") => {
2027                        // start of a new directive
2028                        curr_directive = Directive::default();
2029                        curr_directive.assembler = assembler;
2030
2031                        // iterate over the attributes
2032                        for attr in e.attributes() {
2033                            let Attribute { key, value } = attr.unwrap();
2034                            match key.into_inner() {
2035                                b"name" => {
2036                                    let name = ustr::get_str(&value);
2037                                    curr_directive.name = name.to_ascii_lowercase();
2038                                }
2039                                b"description" => {
2040                                    let description = ustr::get_str(&value);
2041                                    curr_directive.description =
2042                                        unescape(description).unwrap().to_string();
2043                                }
2044                                _ => {}
2045                            }
2046                        }
2047                    }
2048                    QName(b"Signature") => {
2049                        for attr in e.attributes() {
2050                            let Attribute { key, value } = attr.unwrap();
2051                            if b"sig" == key.into_inner() {
2052                                let sig = ustr::get_str(&value);
2053                                curr_directive
2054                                    .signatures
2055                                    .push(unescape(sig).unwrap().to_string());
2056                            }
2057                        }
2058                    }
2059                    _ => {} // unknown event
2060                }
2061            }
2062            // end event
2063            Ok(Event::End(ref e)) => {
2064                if QName(b"Directive") == e.name() {
2065                    // finish directive
2066                    directives_map.insert(curr_directive.name.clone(), curr_directive.clone());
2067                }
2068            }
2069            Ok(Event::Eof) => break,
2070            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
2071            _ => {} // rest of events that we don't consider
2072        }
2073    }
2074
2075    // Since directive entries have their assembler labeled on a per-instance basis,
2076    // we check to make sure all of them have been assigned correctly
2077    for directive in directives_map.values() {
2078        assert_ne!(directive.assembler, Assembler::None);
2079    }
2080
2081    Ok(directives_map.into_values().collect())
2082}
2083
2084/// Parse the provided HTML contents and return a vector of all the directives based on that.
2085/// If parsing fails, the appropriate error will be returned instead.
2086///
2087/// <https://cc65.github.io/doc/ca65.html>
2088///
2089/// # Errors
2090///
2091/// This function is highly specialized to parse a single file and will panic or return
2092/// `Err` for most mal-formed/unexpected inputs
2093///
2094/// # Panics
2095///
2096/// This function is highly specialized to parse a single file and will panic or return
2097/// `Err` for most mal-formed/unexpected inputs
2098pub fn populate_ca65_directives(html_conts: &str) -> Result<Vec<Directive>> {
2099    let eat_lines = |lines: &mut Peekable<Lines<'_>>, empty: bool| {
2100        while let Some(line) = lines.peek() {
2101            if empty != line.is_empty() {
2102                break;
2103            }
2104            _ = lines.next().unwrap();
2105        }
2106    };
2107    let name_regex = Regex::new(r"<CODE>(?<name>.+)</CODE>").unwrap();
2108    let url_regex =
2109        Regex::new(r#"^<H2><A NAME=".+"></A> <A NAME="(?<fragment>[a-z, A-Z, 0-9, .]+)">"#)
2110            .unwrap();
2111    let mut directives = Vec::new();
2112    let start = {
2113        let start_marker = r##"<H2><A NAME="pseudo-variables"></A> <A NAME="s9">9.</A> <A HREF="#toc9">Pseudo variables</A></H2>"##;
2114        let section_start = html_conts.find(start_marker).unwrap();
2115        section_start + start_marker.len() + 1 // + 1 for '\n'
2116    };
2117    let mut lines = html_conts[start..].lines().peekable();
2118    eat_lines(&mut lines, true);
2119    _ = lines.next().unwrap(); // Extra info on pseudo variables
2120    _ = lines.next().unwrap();
2121    eat_lines(&mut lines, true);
2122    'outer: loop {
2123        // Consume lines until we find a section header
2124        loop {
2125            let Some(next) = lines.peek() else {
2126                break 'outer;
2127            };
2128            let next = next.trim();
2129            if next.starts_with("<H2><A NAME=\".") || next.starts_with("<H2><A NAME=\"*") {
2130                break;
2131            }
2132            _ = lines.next().unwrap();
2133        }
2134
2135        let name_line = lines.next().unwrap();
2136        let name = {
2137            let Some(caps) = &name_regex.captures(name_line) else {
2138                // If this capture fails, we're at a section header rather than a subsection header
2139                // We don't care about section headers, since they don't document any information
2140                // about directives or anything else of importance. Just consume the lines and move on
2141                eat_lines(&mut lines, true);
2142                eat_lines(&mut lines, false);
2143                eat_lines(&mut lines, true);
2144                continue;
2145            };
2146            caps["name"].to_string()
2147        };
2148        let fragment = &url_regex.captures(name_line).unwrap()["fragment"];
2149        let url = format!("https://cc65.github.io/doc/ca65.html#{fragment}");
2150        assert_eq!(lines.next().unwrap().trim(), "</H2>");
2151        eat_lines(&mut lines, true);
2152        // get the description, remove anything inside carets
2153        let mut description = String::new();
2154        while !lines.peek().unwrap().is_empty() {
2155            let description_line = lines.next().unwrap();
2156            let len_before = description.len();
2157            let mut prev_idx = 0;
2158            for (i, c) in description_line.chars().enumerate() {
2159                match c {
2160                    '<' => {
2161                        #[allow(
2162                            clippy::sliced_string_as_bytes,
2163                            clippy::char_indices_as_byte_indices
2164                        )]
2165                        let bytes: Vec<u8> = description_line[prev_idx..i].as_bytes().to_vec();
2166                        let decoded = htmlentity::entity::decode(&bytes).to_string().unwrap();
2167                        description += &decoded;
2168                    }
2169                    '>' => prev_idx = i + 1,
2170                    _ => {}
2171                }
2172            }
2173            let line_len = description_line.len();
2174            // Not all lines end with a closing tag...
2175            if prev_idx < line_len - 1 {
2176                #[allow(clippy::sliced_string_as_bytes)]
2177                let bytes = description_line[prev_idx..description_line.len()].as_bytes();
2178                let decoded = htmlentity::entity::decode(bytes).to_string().unwrap();
2179                description += &decoded;
2180            }
2181            if description.len() != len_before {
2182                description.push(' ');
2183            }
2184        }
2185        let description = {
2186            while description.ends_with('\n') {
2187                _ = description.pop();
2188            }
2189            description.push('\n');
2190            description.trim().replace("  ", " ")
2191        };
2192        // Some entries cover two items, add both to the map
2193        if name.contains(',') {
2194            for alias in name.split(", ") {
2195                directives.push(Directive {
2196                    name: alias.trim().to_lowercase(),
2197                    signatures: Vec::new(),
2198                    description: description.clone(),
2199                    deprecated: false,
2200                    url: Some(url.clone()),
2201                    assembler: Assembler::Ca65,
2202                });
2203            }
2204        } else {
2205            directives.push(Directive {
2206                name: name.to_lowercase(),
2207                signatures: Vec::new(),
2208                description,
2209                deprecated: false,
2210                url: Some(url),
2211                assembler: Assembler::Ca65,
2212            });
2213        }
2214    }
2215
2216    Ok(directives)
2217}
2218
2219pub fn populate_name_to_directive_map(
2220    assem: Assembler,
2221    directives: &Vec<Directive>,
2222    names_to_directives: &mut NameToDirectiveMap,
2223) {
2224    for directive in directives {
2225        for name in &directive.get_associated_names() {
2226            names_to_directives.insert((assem, (*name).to_string()), directive.clone());
2227        }
2228    }
2229}
2230
2231fn get_docs_body(x86_online_docs: &str) -> Option<String> {
2232    // provide a URL example page
2233    // 1. If the cache refresh option is enabled or the cache doesn't exist, attempt to fetch the
2234    //    data, write it to the cache, and then use it
2235    // 2. Otherwise, attempt to read the data from the cache
2236    // 3. If invalid data is read in, attempt to remove the cache file
2237    let cache_refresh = args().any(|arg| arg.contains("--cache-refresh"));
2238    let mut x86_cache_path = match get_cache_dir() {
2239        Ok(cache_path) => Some(cache_path),
2240        Err(e) => {
2241            eprintln!("Failed to resolve the cache file path - Error: {e}.");
2242            None
2243        }
2244    };
2245
2246    // Attempt to append the cache file name to path and see if it is valid/ exists
2247    let cache_exists: bool;
2248    if let Some(mut path) = x86_cache_path {
2249        path.push("x86_instr_docs.html");
2250        cache_exists = matches!(path.try_exists(), Ok(true));
2251        x86_cache_path = Some(path);
2252    } else {
2253        cache_exists = false;
2254    }
2255
2256    let body = if cache_refresh || !cache_exists {
2257        match get_x86_docs_web(x86_online_docs) {
2258            Ok(docs) => {
2259                if let Some(ref path) = x86_cache_path {
2260                    set_x86_docs_cache(&docs, path);
2261                }
2262                docs
2263            }
2264            Err(e) => {
2265                eprintln!("Failed to fetch documentation from {x86_online_docs} - Error: {e}.");
2266                return None;
2267            }
2268        }
2269    } else if let Some(ref path) = x86_cache_path {
2270        match get_x86_docs_cache(path) {
2271            Ok(docs) => docs,
2272            Err(e) => {
2273                eprintln!(
2274                    "Failed to fetch documentation from the cache: {} - Error: {e}.",
2275                    path.display()
2276                );
2277                return None;
2278            }
2279        }
2280    } else {
2281        eprintln!("Failed to fetch documentation from the cache - Invalid path.");
2282        return None;
2283    };
2284
2285    // try to create the iterator to check if the data is valid
2286    // if the body produces an empty iterator, we attempt to clear the cache
2287    if body.split("<td>").skip(1).step_by(2).next().is_none() {
2288        eprintln!("Invalid docs contents.");
2289        if let Some(ref path) = x86_cache_path {
2290            eprintln!("Attempting to remove the cache file {}...", path.display());
2291            match std::fs::remove_file(path) {
2292                Ok(()) => {
2293                    eprintln!("Cache file removed.");
2294                }
2295                Err(e) => {
2296                    eprintln!("Failed to remove the cache file - Error: {e}.",);
2297                }
2298            }
2299        } else {
2300            eprintln!("Unable to clear the cache, invalid path.");
2301        }
2302        return None;
2303    }
2304
2305    Some(body)
2306}
2307
2308/// Searches for the asm-lsp cache directory.
2309///
2310/// - First checks for the `ASM_LSP_CACHE_DIR` environment variable. If this variable
2311///   is present and points to a valid directory, this path is returned.
2312/// - Otherwise, the function returns `~/.config/asm-lsp/`
2313///
2314/// # Errors
2315///
2316/// Returns `Err` if no directory can be found through `ASM_LSP_CACHE_DIR`, and
2317/// then no home directory can be found on the system
2318pub fn get_cache_dir() -> Result<PathBuf> {
2319    // first check if the appropriate environment variable is set
2320    if let Ok(path) = std::env::var("ASM_LSP_CACHE_DIR") {
2321        let path = PathBuf::from(path);
2322        // ensure the path is valid
2323        if path.is_dir() {
2324            return Ok(path);
2325        }
2326    }
2327
2328    // If the environment variable isn't set or gives an invalid path, grab the home directory and build off of that
2329    let mut x86_cache_path = home::home_dir().ok_or_else(|| anyhow!("Home directory not found"))?;
2330
2331    x86_cache_path.push(".cache");
2332    x86_cache_path.push("asm-lsp");
2333
2334    // create the ~/.cache/asm-lsp directory if it's not already there
2335    fs::create_dir_all(&x86_cache_path)?;
2336
2337    Ok(x86_cache_path)
2338}
2339
/// Base URL of the online x86 instruction documentation used in regular builds.
#[cfg(not(test))]
fn get_x86_docs_url() -> String {
    "https://www.felixcloutier.com/x86/".to_owned()
}

/// Base URL used in test builds: a local address, matching the mock server the tests
/// start on port 8080.
#[cfg(test)]
fn get_x86_docs_url() -> String {
    "http://127.0.0.1:8080/x86/".to_owned()
}
2349
2350fn get_x86_docs_web(x86_online_docs: &str) -> Result<String> {
2351    println!("Fetching further documentation from the web -> {x86_online_docs}...");
2352    // grab the info from the web
2353    let contents = reqwest::blocking::get(x86_online_docs)?.text()?;
2354    Ok(contents)
2355}
2356
/// Reads the cached documentation page at `x86_cache_path` into a string.
/// A progress message naming the file is printed to stdout first.
///
/// Returns the I/O error unchanged if the file cannot be read.
fn get_x86_docs_cache(x86_cache_path: &PathBuf) -> Result<String, std::io::Error> {
    let shown_path = x86_cache_path.display();
    println!(
        "Fetching html page containing further documentation, from the cache -> {}...",
        shown_path
    );
    let cached_page = fs::read_to_string(x86_cache_path)?;
    Ok(cached_page)
}
2364
/// Writes `contents` to the cache file at `x86_cache_path`, creating or truncating it.
///
/// This is best-effort: progress is reported on stdout, and a failure to create or write the
/// file is reported on stderr without being returned to the caller.
fn set_x86_docs_cache(contents: &str, x86_cache_path: &PathBuf) {
    println!("Writing to the cache file {}...", x86_cache_path.display());

    let mut cache_file = match fs::File::create(x86_cache_path) {
        Ok(file) => file,
        Err(e) => {
            eprintln!(
                "Failed to create the cache file {} - Error: {e}.",
                x86_cache_path.display()
            );
            return;
        }
    };
    println!("Created the cache file {} .", x86_cache_path.display());

    if let Err(e) = cache_file.write_all(contents.as_bytes()) {
        eprintln!(
            "Failed to write to the cache file {} - Error: {e}.",
            x86_cache_path.display()
        );
    } else {
        println!("Populated the cache.");
    }
}
2390
#[cfg(test)]
mod tests {
    use mockito::ServerOpts;

    use crate::parser::{get_cache_dir, populate_instructions};

    /// Parses both opcode XML files while the documentation page is served by a local mock
    /// server, removing the cache file before each run so that a request is made every time.
    #[test]
    fn test_populate_instructions() {
        let mut server = mockito::Server::new_with_opts(ServerOpts {
            port: 8080,
            ..Default::default()
        });

        let _ = server
            .mock("GET", "/x86/")
            .with_status(200)
            .with_header("content-type", "text/html")
            .with_body(include_str!(
                "../docs_store/instr_info_cache/x86_instr_docs.html"
            ))
            .create();

        let x86_cache_path = get_cache_dir().unwrap().join("x86_instr_docs.html");
        // Deletes the cache file if there is one
        let clear_cache = || {
            if x86_cache_path.is_file() {
                std::fs::remove_file(&x86_cache_path).unwrap();
            }
        };

        // Need to clear the cache file (if there is one)
        // to ensure a request is made for each test call
        clear_cache();
        let xml_conts_x86 = include_str!("../docs_store/opcodes/x86.xml");
        assert!(populate_instructions(xml_conts_x86).is_ok());

        clear_cache();
        let xml_conts_x86_64 = include_str!("../docs_store/opcodes/x86_64.xml");
        assert!(populate_instructions(xml_conts_x86_64).is_ok());

        // Clean things up so we don't leave a cache file behind
        clear_cache();
    }
}