// hence/hence/src/lib/lexer.rs
use anyhow::{bail, Result};
use itertools::Itertools;

use crate::assembler;

/// A single lexical token of the hence assembly language.
///
/// Trivia (comments, whitespace, newlines) is kept as tokens carrying the
/// original text so a token stream can be printed back out via `ToCode`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Comment text, without the leading `;`/`@`/`#` marker.
    Comment(String),
    /// Character literal contents, without the surrounding `'` quotes.
    CharLiteral(String),
    /// String literal contents, without the surrounding `"` quotes.
    StringLiteral(String),
    /// A `.name` macro invocation, stored including the leading dot.
    MacroLiteral(String),
    /// A bare identifier (alphanumeric plus `_`, starting alphabetic).
    Literal(String),
    /// A numeric literal, stored as written (the lexer accepts any
    /// alphanumeric characters and `_` after the leading digit).
    Number(String),
    Comma,
    Colon,
    LParen,
    RParen,
    // Bitwise operators.
    Not,
    And,
    Nand,
    Or,
    Nor,
    Xor,
    Xnor,
    Lsh,
    Rsh,
    // Arithmetic operators.
    Add,
    Sub,
    Mul,
    Div,
    Pow,
    // Comparison / logic operators.
    Cmp,
    Eq,
    Neq,
    Lt,
    Gt,
    Leq,
    Geq,
    Bol,
    Inv,
    Rnd,
    /// One or more consecutive `\n` characters, stored verbatim.
    Newline(String),
    /// A run of non-newline whitespace, stored verbatim.
    Whitespace(String),
}
2022-09-03 13:14:58 +00:00
impl assembler::ToCode for Token {
fn to_code(&self) -> String {
2022-07-17 18:24:49 +00:00
match self {
2022-09-02 09:33:46 +00:00
Token::Comment(x) => format!(";{}", x),
Token::CharLiteral(x) => format!("'{}'", x),
Token::StringLiteral(x) => format!("\"{}\"", x),
2022-07-17 18:24:49 +00:00
Token::MacroLiteral(x) => x.clone(),
Token::Literal(x) => x.clone(),
Token::Number(x) => x.clone(),
Token::Comma => ",".to_string(),
Token::Colon => ":".to_string(),
Token::LParen => "(".to_string(),
Token::RParen => ")".to_string(),
2022-09-02 09:33:46 +00:00
Token::Not => "~".to_string(),
Token::And => "&".to_string(),
Token::Nand => "~&".to_string(),
Token::Or => "|".to_string(),
Token::Nor => "~|".to_string(),
Token::Xor => "^".to_string(),
Token::Xnor => "~^".to_string(),
Token::Lsh => "<<".to_string(),
Token::Rsh => ">>".to_string(),
2022-07-17 18:24:49 +00:00
Token::Add => "+".to_string(),
Token::Sub => "-".to_string(),
Token::Mul => "*".to_string(),
Token::Div => "/".to_string(),
Token::Pow => "**".to_string(),
2022-09-02 09:33:46 +00:00
Token::Cmp => "<=>".to_string(),
Token::Eq => "==".to_string(),
Token::Neq => "!=".to_string(),
Token::Lt => "<".to_string(),
Token::Gt => ">".to_string(),
Token::Leq => "<=".to_string(),
Token::Geq => ">=".to_string(),
Token::Bol => "!!".to_string(),
Token::Inv => "!".to_string(),
Token::Rnd => "?".to_string(),
2022-07-17 18:24:49 +00:00
Token::Newline(x) | Token::Whitespace(x) => x.clone(),
}
}
}
2022-09-02 09:33:46 +00:00
pub fn lex(source: String) -> Result<Vec<Token>> {
2022-07-17 18:24:49 +00:00
let mut chars = source.chars().peekable();
2022-09-02 09:33:46 +00:00
let mut tokens: Vec<Token> = vec![];
2022-07-17 18:24:49 +00:00
while let Some(&ch) = chars.peek() {
2022-09-02 09:33:46 +00:00
tokens.push(match ch {
';' => {
2022-07-17 18:24:49 +00:00
chars.next();
2022-09-02 09:33:46 +00:00
chars.next_if(|c| *c == ';');
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'@' => {
chars.next();
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'#' => {
chars.next();
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'\'' => {
chars.next();
Token::CharLiteral(chars.by_ref().take_while(|c| *c != '\'').collect())
2022-07-17 18:24:49 +00:00
}
'"' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::StringLiteral(chars.by_ref().take_while(|c| *c != '"').collect())
2022-07-17 18:24:49 +00:00
}
'.' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::MacroLiteral(format!(
2022-07-17 18:24:49 +00:00
".{}",
chars
2022-08-23 14:20:38 +00:00
.peeking_take_while(|c| c.is_alphabetic() || c.is_numeric() || *c == '_')
2022-07-17 18:24:49 +00:00
.collect::<String>()
2022-09-02 09:33:46 +00:00
))
2022-07-17 18:24:49 +00:00
}
ch if ch.is_alphabetic() => {
let name: String = chars
2022-08-23 14:20:38 +00:00
.peeking_take_while(|c| c.is_alphabetic() || c.is_numeric() || *c == '_')
2022-07-17 18:24:49 +00:00
.collect();
2022-09-02 09:33:46 +00:00
Token::Literal(name)
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
ch if ch.is_numeric() => Token::Number(
chars
.peeking_take_while(|c| c.is_alphanumeric() || *c == '_')
.collect(),
),
2022-07-17 18:24:49 +00:00
',' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Comma
2022-07-17 18:24:49 +00:00
}
':' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Colon
2022-07-17 18:24:49 +00:00
}
'(' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::LParen
2022-07-17 18:24:49 +00:00
}
')' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::RParen
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'~' => {
2022-07-17 18:24:49 +00:00
chars.next();
2022-09-02 09:33:46 +00:00
if let Some(c) = chars.peek() {
match c {
'&' => {
chars.next();
Token::Nand
}
'|' => {
chars.next();
Token::Nor
}
'^' => {
chars.next();
Token::Xnor
}
_ => Token::Not,
}
} else {
Token::Not
}
}
'&' => {
chars.next();
Token::And
}
'|' => {
chars.next();
Token::Or
}
'^' => {
chars.next();
Token::Xor
}
'<' => {
chars.next();
match chars.peek() {
Some('<') => {
chars.next();
Token::Lsh
}
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => {
chars.next();
Token::Cmp
}
_ => Token::Leq,
}
}
_ => Token::Lt,
}
}
'>' => {
chars.next();
match chars.peek() {
Some('>') => {
chars.next();
Token::Rsh
}
Some('=') => {
chars.next();
Token::Geq
}
_ => Token::Gt,
}
2022-07-17 18:24:49 +00:00
}
'+' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Add
2022-07-17 18:24:49 +00:00
}
'-' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Sub
2022-07-17 18:24:49 +00:00
}
'*' => {
chars.next();
2022-09-02 09:33:46 +00:00
if let Some('*') = chars.peek() {
chars.next();
2022-07-17 18:24:49 +00:00
Token::Pow
} else {
Token::Mul
2022-09-02 09:33:46 +00:00
}
2022-07-17 18:24:49 +00:00
}
'/' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Div
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'=' => {
chars.next();
if let Some('=') = chars.peek() {
chars.next();
}
Token::Eq
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'!' => {
chars.next();
match chars.peek() {
Some('!') => {
chars.next();
Token::Bol
}
Some('=') => {
chars.next();
Token::Neq
}
_ => Token::Inv,
}
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'?' => {
chars.next();
Token::Rnd
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'\n' => Token::Newline(chars.peeking_take_while(|c| *c == '\n').collect()),
ch if ch.is_whitespace() => Token::Whitespace(
chars
.peeking_take_while(|c| c.is_whitespace() && *c != '\n')
.collect(),
),
_ => bail!("Unexpected token: {}", ch),
});
2022-07-17 18:24:49 +00:00
}
Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::assembler::ToCode;

    /// Tokens render back to their textual source form.
    #[test]
    fn test_token_to_assembly() {
        assert_eq!(
            Token::Comment(" \"main function\" like definition macro".to_string()).to_code(),
            "; \"main function\" like definition macro".to_string()
        );
        assert_eq!(
            Token::CharLiteral("\\n".to_string()).to_code(),
            "'\\n'".to_string()
        );
        assert_eq!(
            Token::MacroLiteral("xyz".to_string()).to_code(),
            "xyz".to_string()
        );
        assert_eq!(
            Token::Literal("xkcd".to_string()).to_code(),
            "xkcd".to_string()
        );
        assert_eq!(Token::Newline("\n".to_string()).to_code(), "\n".to_string());
        assert_eq!(
            Token::Whitespace(" ".to_string()).to_code(),
            " ".to_string()
        );
    }

    /// Each comment marker and literal form lexes to the expected tokens.
    #[test]
    fn test_lex() -> Result<()> {
        assert_eq!(
            lex(";; test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("@ test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("# test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("'\\n'".to_string())?,
            vec![Token::CharLiteral("\\n".to_string())]
        );
        assert_eq!(
            lex("\"test\"".to_string())?,
            vec![Token::StringLiteral("test".to_string())]
        );
        assert_eq!(
            lex(".debug CORE_REG_PC".to_string())?,
            vec![
                Token::MacroLiteral(".debug".to_string()),
                Token::Whitespace(" ".to_string()),
                Token::Literal("CORE_REG_PC".to_string())
            ]
        );
        Ok(())
    }
}