use std::collections::HashMap;
use tree_sitter;
use tree_sitter_integerbasic;
use crate::lang;
use crate::lang::{Navigate,Navigation};
use super::token_maps;
use log::{warn,error};
use crate::{STDRESULT,DYNERR};
/// Converts Integer BASIC source text into tokenized bytes and back again.
///
/// The working buffers are reset by `tokenize` and `tokenize_line`, so one
/// instance can be reused for several programs.
pub struct Tokenizer {
    /// Text of the source line currently being tokenized.
    line: String,
    /// Tokenized bytes accumulated for the whole program.
    tokenized_program: Vec<u8>,
    /// Tokenized bytes of the line currently being processed.
    tokenized_line: Vec<u8>,
    /// Lookup from syntax-node kind to token byte.
    tok_map: HashMap<&'static str, u8>,
    /// Lookup from token byte to its textual form.
    detok_map: HashMap<u8, &'static str>,
    /// Settings; the detokenizer reads its line limits and escapes from here.
    config: super::settings::Settings,
}
impl Navigate for Tokenizer
{
fn visit(&mut self,curs:&tree_sitter::TreeCursor) -> Result<Navigation,DYNERR>
{
if curs.node().kind()=="linenum" || curs.node().kind()=="integer" {
let cleaned = self.text(curs.node()).replace(" ","");
if let Ok(num) = i16::from_str_radix(&cleaned,10) {
let bytes = i16::to_le_bytes(num);
if let Some(parent) = curs.node().parent() {
if parent.kind()!="line" {
let header = i16::to_string(&num).as_bytes()[0] + 128;
self.tokenized_line.push(header);
}
}
self.tokenized_line.push(bytes[0]);
self.tokenized_line.push(bytes[1]);
return Ok(Navigation::GotoSibling);
}
error!("number node did not parse as i16");
self.line = "ERR".to_string();
return Ok(Navigation::Exit);
}
if let Some(tok) = self.tok_map.get(curs.node().kind()) {
self.tokenized_line.push(*tok);
if curs.node().kind().starts_with("com_") {
warn!("{} is an immediate mode only command",self.text(curs.node()).to_ascii_uppercase());
}
return Ok(Navigation::GotoSibling);
}
if curs.node().kind()=="str_name" || curs.node().kind()=="int_name" {
let cleaned = self.text(curs.node()).to_uppercase().replace(" ","").as_bytes().to_vec();
let mut neg = cleaned.iter().map(|b| {
if *b==36 {
64 } else {
b+128
}
}).collect();
self.tokenized_line.append(&mut neg);
return Ok(Navigation::GotoSibling);
}
if curs.node().kind()=="string" {
let mut neg: Vec<u8> = vec![0x28];
let txt = Self::stringlike_node_to_bytes(&self.text(curs.node()), false);
neg.append(&mut txt[1..txt.len()-1].to_vec());
neg.push(0x29);
self.tokenized_line.append(&mut neg);
return Ok(Navigation::GotoSibling);
}
if curs.node().kind()=="comment_text" {
let mut neg = Self::stringlike_node_to_bytes(&self.text(curs.node()), false);
self.tokenized_line.append(&mut neg);
return Ok(Navigation::GotoSibling);
}
if curs.node().named_child_count()==0 {
self.tokenized_line.append(&mut self.text(curs.node()).to_uppercase().replace(" ","").as_bytes().to_vec());
return Ok(Navigation::GotoSibling);
}
return Ok(Navigation::GotoChild);
}
}
impl Tokenizer
{
pub fn new() -> Self
{
Self {
line: String::new(),
tokenized_line: Vec::<u8>::new(),
tokenized_program: Vec::<u8>::new(),
tok_map: HashMap::from(token_maps::TOK_MAP),
detok_map: HashMap::from(token_maps::DETOK_MAP),
config: super::settings::Settings::new()
}
}
pub fn set_config(&mut self,config: super::settings::Settings) {
self.config = config;
}
fn text(&self,node: tree_sitter::Node) -> String {
let rng = std::ops::Range {start: node.range().start_point.column, end: node.range().end_point.column};
String::from(&self.line[rng])
}
fn stringlike_node_to_bytes(txt: &str,trim: bool) -> Vec<u8> {
let trimmed = match trim { true => txt.trim_start().to_string(), false => txt.to_string() };
return crate::escaped_ascii_to_bytes(&trimmed, true, true);
}
fn tokenize_line(&mut self,parser: &mut tree_sitter::Parser) -> STDRESULT {
self.tokenized_line = Vec::new();
let tree = parser.parse(&self.line,None).expect("Error parsing file");
self.walk(&tree)?;
if self.line=="ERR" {
return Err(Box::new(lang::Error::Syntax));
}
if self.tokenized_line.len()>126 {
error!("integer BASIC line too long");
return Err(Box::new(lang::Error::Syntax));
}
self.tokenized_line.insert(0,self.tokenized_line.len() as u8 +2);
self.tokenized_line.push(1);
Ok(())
}
pub fn tokenize(&mut self,program: String) -> Result<Vec<u8>,DYNERR> {
self.tokenized_program = Vec::new();
let mut parser = tree_sitter::Parser::new();
parser.set_language(&tree_sitter_integerbasic::LANGUAGE.into()).expect("error loading integer grammar");
for line in program.lines() {
if line.trim().len()==0 {
continue;
}
self.line = String::from(line) + "\n";
self.tokenize_line(&mut parser)?;
self.tokenized_program.append(&mut self.tokenized_line);
}
Ok(self.tokenized_program.clone())
}
pub fn detokenize(&self,img: &[u8]) -> Result<String,DYNERR> {
const OPEN_QUOTE: u8 = 0x28;
const CLOSE_QUOTE: u8 = 0x29;
const REM_TOK: u8 = 93;
const EOL: u8 = 0x01;
let mut addr = 0;
let mut code = String::new();
let mut line_count = 0;
while addr < 65536 && addr+2<img.len() && line_count < self.config.detokenizer.max_lines {
addr += 1; let line_num: u16 = img[addr] as u16 + img[addr+1] as u16*256;
code += &(u16::to_string(&line_num) + " ");
addr += 2;
let mut escaped: String;
for rep in 0..=self.config.detokenizer.max_line_length {
if rep==self.config.detokenizer.max_line_length {
error!("integer BASIC line is too long");
return Err(Box::new(lang::Error::Syntax));
}
if addr >= img.len() {
error!("program ended while processing line");
return Err(Box::new(lang::Error::Detokenization));
}
if img[addr]==EOL {
line_count += 1;
code += "\n";
addr += 1;
break;
} else if img[addr] == OPEN_QUOTE {
code += "\"";
(escaped,addr) = super::bytes_to_escaped_string_ex(&img, addr+1, &self.config.detokenizer.escapes, &[CLOSE_QUOTE,EOL]);
code += &escaped;
if img[addr] == CLOSE_QUOTE {
code += "\"";
addr += 1;
}
} else if img[addr] == REM_TOK {
if !code.ends_with(" ") {
code += " ";
}
code += "REM"; (escaped,addr) = super::bytes_to_escaped_string_ex(&img, addr+1, &self.config.detokenizer.escapes, &[EOL]);
code += &escaped;
} else if img[addr]<128 {
if let Some(tok) = self.detok_map.get(&img[addr]) {
if tok.len()>1 && *tok!="<>" && !code.ends_with(" ") {
code += " ";
}
code += &tok.to_uppercase();
if tok.len()>1 && *tok!="<>" && !tok.ends_with("(") && !tok.ends_with("=") {
code += " ";
}
addr += 1;
} else {
error!("unrecognized integer BASIC token {} encountered",img[addr]);
return Err(Box::new(lang::Error::Syntax));
}
} else if img[addr]>=176 && img[addr]<=185 {
if addr+2 >= img.len() {
error!("program ended while processing integer");
return Err(Box::new(lang::Error::Detokenization));
}
code += &u16::to_string(&u16::from_le_bytes([img[addr+1],img[addr+2]]));
addr += 3;
} else {
while img[addr]>=128 {
code += &String::from_utf8(vec![img[addr]-128]).expect("expected negative ASCII was not found");
addr += 1;
if addr >= img.len() {
error!("program ended while processing variable name");
return Err(Box::new(lang::Error::Detokenization));
}
}
}
}
}
return Ok(code);
}
pub fn detokenize_from_ram(&self,img: &[u8]) -> Result<String,DYNERR> {
if img.len() < 0x8000 {
error!("RAM image too small {}",img.len());
return Err(Box::new(lang::Error::Detokenization));
}
let addr = img[202] as usize + img[203] as usize * 256;
let himem = img[76] as usize + img[77] as usize * 256;
if addr >= img.len() || himem > img.len() || addr >= himem {
error!("program pointers or RAM image bounds are invalid");
return Err(Box::new(lang::Error::Detokenization));
}
self.detokenize(&img[addr..himem])
}
}