// hence/hence/src/lib/lexer.rs
use anyhow::{bail, Result};
use itertools::Itertools;

use crate::assembler;

/// A single lexical token of the hence assembly language.
///
/// Trivia (comments, whitespace, newlines) is kept as tokens carrying the
/// original text so a token stream can be printed back out via `ToCode`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Comment text, without the leading `;`/`@`/`#` marker.
    Comment(String),
    /// Character literal contents, without the surrounding `'` quotes.
    CharLiteral(String),
    /// String literal contents, without the surrounding `"` quotes.
    StringLiteral(String),
    /// A `.name` macro invocation, stored including the leading dot.
    MacroLiteral(String),
    /// A bare identifier (alphanumeric plus `_`, starting alphabetic).
    Literal(String),
    /// A numeric literal, stored as written (the lexer accepts any
    /// alphanumeric characters and `_` after the leading digit).
    Number(String),
    Comma,
    Colon,
    LParen,
    RParen,
    // Bitwise operators.
    Not,
    And,
    Nand,
    Or,
    Nor,
    Xor,
    Xnor,
    Lsh,
    Rsh,
    // Arithmetic operators.
    Add,
    Sub,
    Mul,
    Div,
    Pow,
    // Comparison / logic operators.
    Cmp,
    Eq,
    Neq,
    Lt,
    Gt,
    Leq,
    Geq,
    Bol,
    Inv,
    Rnd,
    /// One or more consecutive `\n` characters, stored verbatim.
    Newline(String),
    /// A run of non-newline whitespace, stored verbatim.
    Whitespace(String),
}
2022-09-03 13:14:58 +00:00
impl assembler::ToCode for Token {
fn to_code(&self) -> String {
2022-07-17 18:24:49 +00:00
match self {
2022-09-02 09:33:46 +00:00
Token::Comment(x) => format!(";{}", x),
Token::CharLiteral(x) => format!("'{}'", x),
Token::StringLiteral(x) => format!("\"{}\"", x),
2022-07-17 18:24:49 +00:00
Token::MacroLiteral(x) => x.clone(),
Token::Literal(x) => x.clone(),
Token::Number(x) => x.clone(),
Token::Comma => ",".to_string(),
Token::Colon => ":".to_string(),
Token::LParen => "(".to_string(),
Token::RParen => ")".to_string(),
2022-09-02 09:33:46 +00:00
Token::Not => "~".to_string(),
Token::And => "&".to_string(),
Token::Nand => "~&".to_string(),
Token::Or => "|".to_string(),
Token::Nor => "~|".to_string(),
Token::Xor => "^".to_string(),
Token::Xnor => "~^".to_string(),
Token::Lsh => "<<".to_string(),
Token::Rsh => ">>".to_string(),
2022-07-17 18:24:49 +00:00
Token::Add => "+".to_string(),
Token::Sub => "-".to_string(),
Token::Mul => "*".to_string(),
Token::Div => "/".to_string(),
Token::Pow => "**".to_string(),
2022-09-02 09:33:46 +00:00
Token::Cmp => "<=>".to_string(),
Token::Eq => "==".to_string(),
Token::Neq => "!=".to_string(),
Token::Lt => "<".to_string(),
Token::Gt => ">".to_string(),
Token::Leq => "<=".to_string(),
Token::Geq => ">=".to_string(),
Token::Bol => "!!".to_string(),
Token::Inv => "!".to_string(),
Token::Rnd => "?".to_string(),
2022-07-17 18:24:49 +00:00
Token::Newline(x) | Token::Whitespace(x) => x.clone(),
}
}
}
2022-09-02 09:33:46 +00:00
pub fn lex(source: String) -> Result<Vec<Token>> {
2022-07-17 18:24:49 +00:00
let mut chars = source.chars().peekable();
2022-09-02 09:33:46 +00:00
let mut tokens: Vec<Token> = vec![];
2022-07-17 18:24:49 +00:00
while let Some(&ch) = chars.peek() {
2022-09-02 09:33:46 +00:00
tokens.push(match ch {
';' => {
2022-07-17 18:24:49 +00:00
chars.next();
2022-09-02 09:33:46 +00:00
chars.next_if(|c| *c == ';');
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'@' => {
chars.next();
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'#' => {
chars.next();
Token::Comment(chars.peeking_take_while(|c| *c != '\n').collect())
}
'\'' => {
chars.next();
Token::CharLiteral(chars.by_ref().take_while(|c| *c != '\'').collect())
2022-07-17 18:24:49 +00:00
}
'"' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::StringLiteral(chars.by_ref().take_while(|c| *c != '"').collect())
2022-07-17 18:24:49 +00:00
}
'.' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::MacroLiteral(format!(
2022-07-17 18:24:49 +00:00
".{}",
chars
2022-08-23 14:20:38 +00:00
.peeking_take_while(|c| c.is_alphabetic() || c.is_numeric() || *c == '_')
2022-07-17 18:24:49 +00:00
.collect::<String>()
2022-09-02 09:33:46 +00:00
))
2022-07-17 18:24:49 +00:00
}
ch if ch.is_alphabetic() => {
let name: String = chars
2022-08-23 14:20:38 +00:00
.peeking_take_while(|c| c.is_alphabetic() || c.is_numeric() || *c == '_')
2022-07-17 18:24:49 +00:00
.collect();
2022-09-02 09:33:46 +00:00
Token::Literal(name)
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
ch if ch.is_numeric() => Token::Number(
chars
.peeking_take_while(|c| c.is_alphanumeric() || *c == '_')
.collect(),
),
2022-07-17 18:24:49 +00:00
',' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Comma
2022-07-17 18:24:49 +00:00
}
':' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Colon
2022-07-17 18:24:49 +00:00
}
'(' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::LParen
2022-07-17 18:24:49 +00:00
}
')' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::RParen
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'~' => {
2022-07-17 18:24:49 +00:00
chars.next();
2022-09-02 09:33:46 +00:00
if let Some(c) = chars.peek() {
match c {
'&' => {
chars.next();
Token::Nand
}
'|' => {
chars.next();
Token::Nor
}
'^' => {
chars.next();
Token::Xnor
}
_ => Token::Not,
}
} else {
Token::Not
}
}
'&' => {
chars.next();
Token::And
}
'|' => {
chars.next();
Token::Or
}
'^' => {
chars.next();
Token::Xor
}
'<' => {
chars.next();
match chars.peek() {
Some('<') => {
chars.next();
Token::Lsh
}
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => {
chars.next();
Token::Cmp
}
_ => Token::Leq,
}
}
_ => Token::Lt,
}
}
'>' => {
chars.next();
match chars.peek() {
Some('>') => {
chars.next();
Token::Rsh
}
Some('=') => {
chars.next();
Token::Geq
}
_ => Token::Gt,
}
2022-07-17 18:24:49 +00:00
}
'+' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Add
2022-07-17 18:24:49 +00:00
}
'-' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Sub
2022-07-17 18:24:49 +00:00
}
'*' => {
chars.next();
2022-09-02 09:33:46 +00:00
if let Some('*') = chars.peek() {
chars.next();
2022-07-17 18:24:49 +00:00
Token::Pow
} else {
Token::Mul
2022-09-02 09:33:46 +00:00
}
2022-07-17 18:24:49 +00:00
}
'/' => {
chars.next();
2022-09-02 09:33:46 +00:00
Token::Div
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'=' => {
chars.next();
if let Some('=') = chars.peek() {
chars.next();
}
Token::Eq
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'!' => {
chars.next();
match chars.peek() {
Some('!') => {
chars.next();
Token::Bol
}
Some('=') => {
chars.next();
Token::Neq
}
_ => Token::Inv,
}
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'?' => {
chars.next();
Token::Rnd
2022-07-17 18:24:49 +00:00
}
2022-09-02 09:33:46 +00:00
'\n' => Token::Newline(chars.peeking_take_while(|c| *c == '\n').collect()),
ch if ch.is_whitespace() => Token::Whitespace(
chars
.peeking_take_while(|c| c.is_whitespace() && *c != '\n')
.collect(),
),
_ => bail!("Unexpected token: {}", ch),
});
2022-07-17 18:24:49 +00:00
}
Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::assembler::ToCode;

    /// Tokens render back to their textual source form.
    #[test]
    fn test_token_to_assembly() {
        assert_eq!(
            Token::Comment(" \"main function\" like definition macro".to_string()).to_code(),
            "; \"main function\" like definition macro".to_string()
        );
        assert_eq!(
            Token::CharLiteral("\\n".to_string()).to_code(),
            "'\\n'".to_string()
        );
        assert_eq!(
            Token::MacroLiteral("xyz".to_string()).to_code(),
            "xyz".to_string()
        );
        assert_eq!(
            Token::Literal("xkcd".to_string()).to_code(),
            "xkcd".to_string()
        );
        assert_eq!(Token::Newline("\n".to_string()).to_code(), "\n".to_string());
        assert_eq!(
            Token::Whitespace(" ".to_string()).to_code(),
            " ".to_string()
        );
    }

    /// Each comment marker and literal form lexes to the expected tokens.
    #[test]
    fn test_lex() -> Result<()> {
        assert_eq!(
            lex(";; test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("@ test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("# test".to_string())?,
            vec![Token::Comment(" test".to_string())]
        );
        assert_eq!(
            lex("'\\n'".to_string())?,
            vec![Token::CharLiteral("\\n".to_string())]
        );
        assert_eq!(
            lex("\"test\"".to_string())?,
            vec![Token::StringLiteral("test".to_string())]
        );
        assert_eq!(
            lex(".debug CORE_REG_PC".to_string())?,
            vec![
                Token::MacroLiteral(".debug".to_string()),
                Token::Whitespace(" ".to_string()),
                Token::Literal("CORE_REG_PC".to_string())
            ]
        );
        Ok(())
    }
}