Start writing Forth compiler

Dominic Grimm 2022-09-03 15:14:58 +02:00
parent 173a857a5a
commit ec7a147ec9
No known key found for this signature in database
GPG key ID: A6C051C716D2CE65
27 changed files with 790 additions and 221 deletions

henceforth/Cargo.toml (new file)

@@ -0,0 +1,24 @@
[package]
name = "henceforth"
version = "0.1.0"
edition = "2021"
authors = ["Dominic Grimm <dominic@dergrimm.net>"]
repository = "https://git.dergrimm.net/dergrimm/hence.git"

[lib]
name = "henceforth"
path = "src/lib/lib.rs"

[[bin]]
name = "henceforth"
path = "src/bin/main.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
hence = { path = "../hence" }
clap = { version = "3.2.16", features = ["derive"] }
anyhow = { version = "1.0.62", features = ["backtrace"] }
itertools = "0.10.2"
num-parse = "0.1.2"
sailfish = "0.4.0"


@@ -0,0 +1,3 @@
40 2 + drop drop
: test ( -- 42 ) 40 2 + ;
test .


@@ -0,0 +1,74 @@
use anyhow::Result;
use clap::{Parser, Subcommand};
use std::fs;

use henceforth::*;

#[derive(Debug, Parser)]
#[clap(author, version, about, long_about = None)]
struct Cli {
    #[clap(subcommand)]
    commands: Commands,
}

#[derive(Debug, Subcommand)]
enum Commands {
    #[clap(about = "Lexes source code and outputs tokens")]
    Lex {
        #[clap(value_parser)]
        src: String,
    },
    #[clap(about = "Parses source code and outputs AST")]
    Parse {
        #[clap(value_parser)]
        src: String,
    },
    #[clap(about = "Compiles assembly from source code")]
    Compile {
        #[clap(value_parser)]
        src: String,
        #[clap(value_parser)]
        out: Option<String>,
        #[clap(long, action)]
        dump: bool,
    },
}

fn main() -> Result<()> {
    let args = Cli::parse();

    match args.commands {
        Commands::Lex { src } => {
            let source = fs::read_to_string(src)?;
            let tokens = lexer::lex(source)?;
            dbg!(tokens);

            Ok(())
        }
        Commands::Parse { src } => {
            let source = fs::read_to_string(src)?;
            let tokens = lexer::lex(source)?;
            let body = parser::parse(tokens)?;
            dbg!(body);

            Ok(())
        }
        Commands::Compile { src, out, dump } => {
            let source = fs::read_to_string(&src)?;
            let tokens = lexer::lex(source)?;
            let ast = parser::parse(tokens)?;
            let assembly = compiler::compile(ast)?;
            if let Some(x) = out {
                fs::write(x, &assembly)?;
            }
            if dump {
                println!("{}", assembly);
            }

            Ok(())
        }
    }
}

#[cfg(test)]
mod tests {}
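The `tests` module above is still empty. As an illustration only (not part of this commit), a test like the following could pin down how the derived CLI parses a `compile` invocation via clap's `Parser::parse_from`; the file name `test.fs` is made up:

#[cfg(test)]
mod cli_tests {
    use super::*;

    #[test]
    fn compile_args_parse() {
        // Equivalent to: henceforth compile test.fs --dump
        let cli = Cli::parse_from(["henceforth", "compile", "test.fs", "--dump"]);
        match cli.commands {
            Commands::Compile { src, out, dump } => {
                assert_eq!(src, "test.fs");
                assert_eq!(out, None);
                assert!(dump);
            }
            _ => panic!("expected the Compile subcommand"),
        }
    }
}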


@@ -0,0 +1,113 @@
use std::collections::HashMap;

use anyhow::{bail, Result};
use sailfish::TemplateOnce;

use crate::parser;

#[derive(Debug, Clone, PartialEq)]
pub enum Instruction {
    Push(u16),
    Drop,
    Add,
    Sub,
    Dot,
    Call(String),
}

impl Instruction {
    pub fn from_word(word: &str) -> Option<Self> {
        match word {
            "drop" => Some(Instruction::Drop),
            "+" => Some(Instruction::Add),
            "-" => Some(Instruction::Sub),
            "." => Some(Instruction::Dot),
            _ => None,
        }
    }
}

#[derive(Debug)]
pub struct Word {
    id: u16,
    instructions: Vec<Instruction>,
}

#[derive(Debug)]
pub struct Data {
    instructions: Vec<Instruction>,
    words: HashMap<String, Word>,
}

impl Data {
    fn new() -> Self {
        Self {
            instructions: vec![],
            words: HashMap::new(),
        }
    }

    pub fn generate_instructions(&mut self, body: parser::ast::Body) -> Result<()> {
        for node in body {
            match node {
                parser::ast::Node::Comment(_) => {}
                // String literals are not compiled yet.
                parser::ast::Node::String { .. } => {}
                parser::ast::Node::Number(x) => {
                    self.instructions.push(Instruction::Push(x as u16));
                }
                parser::ast::Node::WordDefinition {
                    name,
                    stack: _,
                    body,
                } => {
                    if Instruction::from_word(&name).is_some() {
                        bail!("Word already exists as compiler instruction: {}", name);
                    } else if self.words.contains_key(&name) {
                        bail!("Word already exists as user word definition: {}", name);
                    }

                    // Compile the definition body into a fresh buffer, then
                    // restore the instructions collected so far.
                    let pre_instructions = self.instructions.clone();
                    self.instructions.clear();
                    self.generate_instructions(body)?;
                    let instructions = self.instructions.clone();
                    self.instructions = pre_instructions;

                    self.words.insert(
                        name,
                        Word {
                            id: self.words.len() as u16,
                            instructions,
                        },
                    );
                }
                parser::ast::Node::Word(x) => {
                    if let Some(ins) = Instruction::from_word(&x) {
                        self.instructions.push(ins);
                    } else if self.words.contains_key(&x) {
                        self.instructions.push(Instruction::Call(x));
                    } else {
                        bail!("Word does not exist: {}", x);
                    }
                }
            }
        }

        Ok(())
    }
}

#[derive(TemplateOnce)]
#[template(path = "assembly.asm")]
pub struct Template {
    pub data: Data,
}

pub fn compile(ast: parser::ast::AST) -> Result<String> {
    let mut data = Data::new();
    data.generate_instructions(ast.body)?;
    dbg!(&data);

    Ok(Template { data }.render_once()?)
}
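As a hedged sketch (not in the commit), the lowering can be checked end to end: lexing and parsing `40 2 +` and feeding the AST through `Data::generate_instructions` should yield `Push(40)`, `Push(2)`, `Add`:

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{lexer, parser};

    #[test]
    fn lowers_simple_arithmetic() -> anyhow::Result<()> {
        let ast = parser::parse(lexer::lex("40 2 +".to_string())?)?;
        let mut data = Data::new();
        data.generate_instructions(ast.body)?;
        assert_eq!(
            data.instructions,
            vec![Instruction::Push(40), Instruction::Push(2), Instruction::Add]
        );
        Ok(())
    }
}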


@@ -0,0 +1,74 @@
use anyhow::Result;
use hence::assembler::ToCode;
use itertools::Itertools;

#[derive(Debug)]
pub enum Token {
    Newline(String),
    Whitespace(String),
    ParenComment(String),
    BackslashComment(String),
    DoubleDashComment(String),
    StringLiteral { mode: String, string: String },
    Number(String),
    Word(String),
}

impl ToCode for Token {
    fn to_code(&self) -> String {
        match self {
            Token::Newline(x) | Token::Whitespace(x) => x.clone(),
            // The lexed comment body keeps its trailing space but not the
            // closing paren, so "( {})" reconstructs "( ... )" exactly.
            Token::ParenComment(x) => format!("( {})", x),
            Token::BackslashComment(x) => format!("\\{}", x),
            Token::DoubleDashComment(x) => format!("-- {}", x),
            Token::StringLiteral { mode, string } => format!("{}\" {}\"", mode, string),
            Token::Number(x) | Token::Word(x) => x.clone(),
        }
    }
}

pub fn is_space(c: char) -> bool {
    c.is_whitespace() || c == '\n'
}

pub fn lex(source: String) -> Result<Vec<Token>> {
    let mut chars = source.chars().peekable();
    let mut tokens: Vec<Token> = vec![];

    while let Some(&c) = chars.peek() {
        tokens.push(match c {
            '\n' => Token::Newline(chars.peeking_take_while(|&c| c == '\n').collect()),
            _ if c.is_whitespace() => {
                Token::Whitespace(chars.peeking_take_while(|&c| c.is_whitespace()).collect())
            }
            '\\' => {
                chars.next();
                Token::BackslashComment(chars.peeking_take_while(|&c| c != '\n').collect())
            }
            _ if c.is_numeric() => {
                Token::Number(chars.peeking_take_while(|&c| !is_space(c)).collect())
            }
            _ => {
                // Read one whitespace-delimited word, then decide what it is.
                let x: String = chars.peeking_take_while(|&c| !is_space(c)).collect();
                match x.as_str() {
                    // skip(1) drops the space following the opening delimiter.
                    "(" => Token::ParenComment(
                        chars.by_ref().skip(1).take_while(|&c| c != ')').collect(),
                    ),
                    "--" => Token::DoubleDashComment(
                        chars.by_ref().take_while(|&c| c != '\n').collect(),
                    ),
                    // Words like `s"` start a string literal; the part
                    // before the quote is the mode.
                    _ if x.ends_with('"') => Token::StringLiteral {
                        mode: x.chars().take(x.len() - 1).collect(),
                        string: chars.by_ref().skip(1).take_while(|&c| c != '"').collect(),
                    },
                    _ => Token::Word(x),
                }
            }
        });
    }

    Ok(tokens)
}
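For illustration only (not part of the commit): `Token` derives only `Debug`, so a quick check of the lexer's output can compare the `Debug` rendering of the token stream:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexes_words_and_numbers() -> anyhow::Result<()> {
        let tokens = lex("40 2 +".to_string())?;
        // Whitespace is preserved as its own token.
        assert_eq!(
            format!("{:?}", tokens),
            r#"[Number("40"), Whitespace(" "), Number("2"), Whitespace(" "), Word("+")]"#
        );
        Ok(())
    }
}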


@@ -0,0 +1,3 @@
pub mod compiler;
pub mod lexer;
pub mod parser;


@@ -0,0 +1,107 @@
use anyhow::{bail, Result};

use crate::lexer;

pub mod ast;

fn process_raw_stack_result(s: Option<&str>) -> Vec<String> {
    match s {
        Some(x) if !x.trim().is_empty() => {
            x.split_whitespace().map(|x| x.trim().to_string()).collect()
        }
        _ => vec![],
    }
}

pub fn parse_stack_result(s: String) -> ast::StackResult {
    let mut splitter = s.splitn(2, "--");

    ast::StackResult {
        before: process_raw_stack_result(splitter.next()),
        after: process_raw_stack_result(splitter.next()),
    }
}

pub fn parse(tokens: Vec<lexer::Token>) -> Result<ast::AST> {
    let mut iter = tokens.into_iter().peekable();
    let mut body: ast::Body = vec![];

    while let Some(token) = iter.next() {
        match token {
            lexer::Token::Newline(_) | lexer::Token::Whitespace(_) => {}
            lexer::Token::ParenComment(x)
            | lexer::Token::BackslashComment(x)
            | lexer::Token::DoubleDashComment(x) => {
                body.push(ast::Node::Comment(x.trim().to_string()));
            }
            lexer::Token::StringLiteral { mode, string } => {
                body.push(ast::Node::String { mode, string });
            }
            lexer::Token::Number(x) => match num_parse::parse_int::<i32>(x.as_str()) {
                Some(n) => {
                    body.push(ast::Node::Number(n));
                }
                None => bail!("Invalid number: {}", x),
            },
            lexer::Token::Word(x) => match x.as_str() {
                ":" => {
                    // Collect tokens up to the matching `;`, tracking nesting
                    // depth so nested definitions stay balanced.
                    let mut depth: usize = 1;
                    let mut content = iter
                        .by_ref()
                        .take_while(|t| match t {
                            lexer::Token::Word(x) => match x.as_str() {
                                ":" => {
                                    depth += 1;
                                    true
                                }
                                ";" => {
                                    depth -= 1;
                                    depth != 0
                                }
                                _ => true,
                            },
                            _ => true,
                        })
                        .collect::<Vec<_>>()
                        .into_iter()
                        .peekable();
                    if depth != 0 {
                        bail!("Unbalanced word definitions");
                    }

                    let name = match content.find(|t| {
                        !matches!(t, lexer::Token::Newline(_) | lexer::Token::Whitespace(_))
                    }) {
                        Some(t) => match t {
                            lexer::Token::Word(x) => x,
                            _ => bail!("Word definition name must be a word itself: {:?}", t),
                        },
                        None => bail!("Word definition can not be empty"),
                    };

                    // Only peek for the optional stack-effect comment: a
                    // definition without one must not lose its first body token.
                    while matches!(
                        content.peek(),
                        Some(lexer::Token::Newline(_) | lexer::Token::Whitespace(_))
                    ) {
                        content.next();
                    }
                    let stack = match content.peek() {
                        Some(lexer::Token::ParenComment(x))
                        | Some(lexer::Token::BackslashComment(x))
                        | Some(lexer::Token::DoubleDashComment(x)) => {
                            let stack = parse_stack_result(x.clone());
                            content.next();
                            Some(stack)
                        }
                        _ => None,
                    };

                    body.push(ast::Node::WordDefinition {
                        name,
                        stack,
                        body: parse(content.collect())?.body,
                    });
                }
                _ => {
                    body.push(ast::Node::Word(x));
                }
            },
        }
    }

    Ok(ast::AST { body })
}
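A small sketch (not in the commit) of `parse_stack_result`: the comment body splits on the first `--` into the before and after stack pictures:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn splits_stack_comment() {
        let r = parse_stack_result("a b -- c".to_string());
        assert_eq!(r.before, vec!["a", "b"]);
        assert_eq!(r.after, vec!["c"]);
    }
}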


@@ -0,0 +1,75 @@
use hence::assembler::ToCode;
use itertools::Itertools;

#[derive(Debug)]
pub struct StackResult {
    pub before: Vec<String>,
    pub after: Vec<String>,
}

impl ToCode for StackResult {
    fn to_code(&self) -> String {
        format!(
            "{}--{}",
            if self.before.is_empty() {
                "".to_string()
            } else {
                format!("{} ", self.before.join(" "))
            },
            if self.after.is_empty() {
                "".to_string()
            } else {
                // Space before the after-picture, so "a b -- c" round-trips.
                format!(" {}", self.after.join(" "))
            }
        )
    }
}

#[derive(Debug)]
pub enum Node {
    Comment(String),
    String {
        mode: String,
        string: String,
    },
    Number(i32),
    WordDefinition {
        name: String,
        stack: Option<StackResult>,
        body: Body,
    },
    Word(String),
}

impl ToCode for Node {
    fn to_code(&self) -> String {
        match self {
            Node::Comment(x) => format!("\\ {}", x),
            Node::String { mode, string } => format!("{}\" {}\"", mode, string),
            Node::Number(x) => x.to_string(),
            Node::WordDefinition { name, stack, body } => format!(
                ": {}{} {} ;",
                name,
                match stack {
                    Some(x) => format!(" {}", x.to_code()),
                    None => "".to_string(),
                },
                body.iter().map(|x| x.to_code()).join(" ")
            ),
            Node::Word(x) => x.clone(),
        }
    }
}

pub type Body = Vec<Node>;

#[derive(Debug)]
pub struct AST {
    pub body: Body,
}

impl ToCode for AST {
    fn to_code(&self) -> String {
        self.body.iter().map(|x| x.to_code()).join(" ")
    }
}
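For illustration (not in the commit), `ToCode` renders an AST back to source-like text:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn renders_nodes() {
        let ast = AST {
            body: vec![
                Node::Number(40),
                Node::Number(2),
                Node::Word("+".to_string()),
            ],
        };
        assert_eq!(ast.to_code(), "40 2 +");
    }
}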


@@ -0,0 +1,10 @@
.include "$lib/core.asm"
.include "$lib/std.asm"
.include "$lib/main.asm"
.jump_main
data:
.main
.std_stop