// hence/hence/src/lib/lexer.rs — lexer for the Hence assembly language.

use anyhow::{bail, Result};
use itertools::Itertools;
use crate::assembler;
/// A single lexical token of the Hence assembly language.
///
/// String-carrying variants keep the raw source text so a token stream can be
/// rendered back to code via `assembler::ToCode`.
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// Comment body with the leading marker (`;`, `;;`, `@`, or `#`) stripped.
    Comment(String),
    /// Contents between single quotes; escape sequences are kept verbatim.
    CharLiteral(String),
    /// Contents between double quotes; escape sequences are kept verbatim.
    StringLiteral(String),
    /// A `.name` macro reference, stored including its leading dot.
    MacroLiteral(String),
    /// Identifier: alphabetic start, then alphanumerics and `_`.
    Literal(String),
    /// Numeric literal: digit start, then alphanumerics and `_`.
    Number(String),
    Comma,
    Colon,
    LParen,
    RParen,
    // Bitwise/logical operators: `~`, `&`, `~&`, `|`, `~|`, `^`, `~^`.
    Not,
    And,
    Nand,
    Or,
    Nor,
    Xor,
    Xnor,
    // Shifts: `<<` and `>>`.
    Lsh,
    Rsh,
    // Arithmetic: `+`, `-`, `*`, `/`, `**`.
    Add,
    Sub,
    Mul,
    Div,
    Pow,
    // Comparisons: `<=>`, `==`, `!=`, `<`, `>`, `<=`, `>=`.
    Cmp,
    Eq,
    Neq,
    Lt,
    Gt,
    Leq,
    Geq,
    // Unary operators spelled `!!`, `!`, and `?` respectively.
    Bol,
    Inv,
    Rnd,
    /// A run of consecutive `\n` characters, preserved verbatim.
    Newline(String),
    /// A run of non-newline whitespace, preserved verbatim.
    Whitespace(String),
}
impl assembler::ToCode for Token {
    /// Render the token back into its source-text spelling.
    ///
    /// Note that all comment markers normalize to a single `;`, regardless of
    /// whether the token was lexed from `;`, `;;`, `@`, or `#`.
    fn to_code(&self) -> String {
        match self {
            // Variants that wrap their stored text in delimiters.
            Token::Comment(body) => format!(";{}", body),
            Token::CharLiteral(body) => format!("'{}'", body),
            Token::StringLiteral(body) => format!("\"{}\"", body),
            // Variants that echo their stored text unchanged.
            Token::MacroLiteral(body)
            | Token::Literal(body)
            | Token::Number(body)
            | Token::Newline(body)
            | Token::Whitespace(body) => body.clone(),
            // Fixed-spelling punctuation and operators.
            Token::Comma => String::from(","),
            Token::Colon => String::from(":"),
            Token::LParen => String::from("("),
            Token::RParen => String::from(")"),
            Token::Not => String::from("~"),
            Token::And => String::from("&"),
            Token::Nand => String::from("~&"),
            Token::Or => String::from("|"),
            Token::Nor => String::from("~|"),
            Token::Xor => String::from("^"),
            Token::Xnor => String::from("~^"),
            Token::Lsh => String::from("<<"),
            Token::Rsh => String::from(">>"),
            Token::Add => String::from("+"),
            Token::Sub => String::from("-"),
            Token::Mul => String::from("*"),
            Token::Div => String::from("/"),
            Token::Pow => String::from("**"),
            Token::Cmp => String::from("<=>"),
            Token::Eq => String::from("=="),
            Token::Neq => String::from("!="),
            Token::Lt => String::from("<"),
            Token::Gt => String::from(">"),
            Token::Leq => String::from("<="),
            Token::Geq => String::from(">="),
            Token::Bol => String::from("!!"),
            Token::Inv => String::from("!"),
            Token::Rnd => String::from("?"),
        }
    }
}
/// Tokenize `source` into a flat stream of [`Token`]s.
///
/// Comments, newlines, and whitespace are kept as tokens so the stream can be
/// rendered back to text. Returns an error on the first character that cannot
/// start any token.
pub fn lex(source: String) -> Result<Vec<Token>> {
    let mut stream = source.chars().peekable();
    let mut out: Vec<Token> = vec![];
    while let Some(&c) = stream.peek() {
        let token = match c {
            // Line comment introduced by `;` or `;;`; body runs to end of line.
            ';' => {
                stream.next();
                stream.next_if_eq(&';');
                Token::Comment(stream.peeking_take_while(|c| *c != '\n').collect())
            }
            // `@` and `#` are alternative single-character comment markers.
            '@' | '#' => {
                stream.next();
                Token::Comment(stream.peeking_take_while(|c| *c != '\n').collect())
            }
            // Character literal; take_while also consumes the closing quote.
            // NOTE(review): an unterminated literal silently runs to EOF.
            '\'' => {
                stream.next();
                Token::CharLiteral(stream.by_ref().take_while(|c| *c != '\'').collect())
            }
            // String literal; same consumption pattern as char literals.
            '"' => {
                stream.next();
                Token::StringLiteral(stream.by_ref().take_while(|c| *c != '"').collect())
            }
            // Macro reference such as `.debug`, stored with its leading dot.
            '.' => {
                stream.next();
                let name: String = stream
                    .peeking_take_while(|c| c.is_alphanumeric() || *c == '_')
                    .collect();
                Token::MacroLiteral(format!(".{}", name))
            }
            // Identifier: alphabetic start, then alphanumerics and `_`.
            c if c.is_alphabetic() => Token::Literal(
                stream
                    .peeking_take_while(|c| c.is_alphanumeric() || *c == '_')
                    .collect(),
            ),
            // Number: digit start, then alphanumerics and `_` (permissive).
            c if c.is_numeric() => Token::Number(
                stream
                    .peeking_take_while(|c| c.is_alphanumeric() || *c == '_')
                    .collect(),
            ),
            // Single-character punctuation and operators.
            ',' | ':' | '(' | ')' | '&' | '|' | '^' | '+' | '-' | '/' | '?' => {
                stream.next();
                match c {
                    ',' => Token::Comma,
                    ':' => Token::Colon,
                    '(' => Token::LParen,
                    ')' => Token::RParen,
                    '&' => Token::And,
                    '|' => Token::Or,
                    '^' => Token::Xor,
                    '+' => Token::Add,
                    '-' => Token::Sub,
                    '/' => Token::Div,
                    '?' => Token::Rnd,
                    _ => unreachable!("guarded by the outer match arm"),
                }
            }
            // `~` either negates the following operator (`~&`, `~|`, `~^`)
            // or stands alone as NOT.
            '~' => {
                stream.next();
                match stream.peek() {
                    Some('&') => {
                        stream.next();
                        Token::Nand
                    }
                    Some('|') => {
                        stream.next();
                        Token::Nor
                    }
                    Some('^') => {
                        stream.next();
                        Token::Xnor
                    }
                    _ => Token::Not,
                }
            }
            // `<` begins `<<`, `<=>`, `<=`, or plain `<`.
            '<' => {
                stream.next();
                if stream.next_if_eq(&'<').is_some() {
                    Token::Lsh
                } else if stream.next_if_eq(&'=').is_some() {
                    if stream.next_if_eq(&'>').is_some() {
                        Token::Cmp
                    } else {
                        Token::Leq
                    }
                } else {
                    Token::Lt
                }
            }
            // `>` begins `>>`, `>=`, or plain `>`.
            '>' => {
                stream.next();
                if stream.next_if_eq(&'>').is_some() {
                    Token::Rsh
                } else if stream.next_if_eq(&'=').is_some() {
                    Token::Geq
                } else {
                    Token::Gt
                }
            }
            // `*` or `**`.
            '*' => {
                stream.next();
                if stream.next_if_eq(&'*').is_some() {
                    Token::Pow
                } else {
                    Token::Mul
                }
            }
            // Both `=` and `==` lex as the equality operator.
            '=' => {
                stream.next();
                stream.next_if_eq(&'=');
                Token::Eq
            }
            // `!!`, `!=`, or plain `!`.
            '!' => {
                stream.next();
                match stream.peek() {
                    Some('!') => {
                        stream.next();
                        Token::Bol
                    }
                    Some('=') => {
                        stream.next();
                        Token::Neq
                    }
                    _ => Token::Inv,
                }
            }
            // A run of newlines becomes a single Newline token.
            '\n' => Token::Newline(stream.peeking_take_while(|c| *c == '\n').collect()),
            // Any other whitespace run (newline-free) is grouped likewise.
            c if c.is_whitespace() => Token::Whitespace(
                stream
                    .peeking_take_while(|c| c.is_whitespace() && *c != '\n')
                    .collect(),
            ),
            other => bail!("Unexpected token: {}", other),
        };
        out.push(token);
    }
    Ok(out)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::assembler::ToCode;

    /// `to_code` must reproduce each token's surface syntax.
    #[test]
    fn test_token_to_assembly() {
        assert_eq!(
            Token::Comment(String::from(" \"main function\" like definition macro")).to_code(),
            String::from("; \"main function\" like definition macro")
        );
        assert_eq!(
            Token::CharLiteral(String::from("\\n")).to_code(),
            String::from("'\\n'")
        );
        assert_eq!(
            Token::MacroLiteral(String::from("xyz")).to_code(),
            String::from("xyz")
        );
        assert_eq!(
            Token::Literal(String::from("xkcd")).to_code(),
            String::from("xkcd")
        );
        assert_eq!(
            Token::Newline(String::from("\n")).to_code(),
            String::from("\n")
        );
        assert_eq!(
            Token::Whitespace(String::from(" ")).to_code(),
            String::from(" ")
        );
    }

    /// Spot checks: every comment marker, both literal forms, and a macro line.
    #[test]
    fn test_lex() -> Result<()> {
        // All three comment markers lex to the same Comment token.
        for marker in &[";;", "@", "#"] {
            assert_eq!(
                lex(format!("{} test", marker))?,
                vec![Token::Comment(String::from(" test"))]
            );
        }
        assert_eq!(
            lex(String::from("'\\n'"))?,
            vec![Token::CharLiteral(String::from("\\n"))]
        );
        assert_eq!(
            lex(String::from("\"test\""))?,
            vec![Token::StringLiteral(String::from("test"))]
        );
        assert_eq!(
            lex(String::from(".debug CORE_REG_PC"))?,
            vec![
                Token::MacroLiteral(String::from(".debug")),
                Token::Whitespace(String::from(" ")),
                Token::Literal(String::from("CORE_REG_PC")),
            ]
        );
        Ok(())
    }
}