hence/henceforth/src/lib/lexer.rs

75 lines
2.4 KiB
Rust

use anyhow::Result;
use hence::assembler::ToCode;
use itertools::Itertools;
#[derive(Debug)]
pub enum Token {
Newline(String),
Whitespace(String),
ParenComment(String),
BackslashComment(String),
DoubleDashComment(String),
StringLiteral { mode: String, string: String },
Number(String),
Word(String),
}
impl ToCode for Token {
fn to_code(&self) -> String {
match self {
Token::Newline(x) | Token::Whitespace(x) => x.clone(),
Token::ParenComment(x) => format!("( {})", x),
Token::BackslashComment(x) => format!("\\{}", x),
Token::DoubleDashComment(x) => format!("-- {}", x),
Token::StringLiteral { mode, string } => format!("{}\" {}\"", mode, string),
Token::Number(x) | Token::Word(x) => x.clone(),
}
}
}
pub fn is_space(c: char) -> bool {
c.is_whitespace() || c == '\n'
}
pub fn lex(source: String) -> Result<Vec<Token>> {
let mut chars = source.chars().peekable();
let mut tokens: Vec<Token> = vec![];
while let Some(&c) = chars.peek() {
tokens.push(match c {
'\n' => Token::Newline(chars.peeking_take_while(|&c| c == '\n').collect()),
_ if c.is_whitespace() => {
Token::Whitespace(chars.peeking_take_while(|&c| c.is_whitespace()).collect())
}
'\\' => {
chars.next();
Token::BackslashComment(chars.peeking_take_while(|&c| c != '\n').collect())
}
_ if c.is_numeric() => {
Token::Number(chars.peeking_take_while(|&c| !is_space(c)).collect())
}
_ => {
let x: String = chars.peeking_take_while(|&c| !is_space(c)).collect();
match x.as_str() {
"(" => Token::ParenComment(
chars.by_ref().skip(1).take_while(|&c| c != ')').collect(),
),
"--" => Token::DoubleDashComment(
chars.by_ref().take_while(|&c| c != '\n').collect(),
),
_ if x.ends_with('"') => Token::StringLiteral {
mode: x.chars().take(x.len() - 1).collect(),
string: chars.by_ref().skip(1).take_while(|&c| c != '"').collect(),
},
_ => Token::Word(x),
}
}
});
}
Ok(tokens)
}