// hydroforth/src/hydroforth/lexer.c

#include <stdlib.h>
#include <string.h>

#include "hydroforth/hydroforth.h"

/**
 * Appends `item` to the growable token array `*arr`.
 *
 * @param arr  Address of the array pointer; updated when the buffer is
 *             reallocated. `*arr` may be NULL when `*size` is 0.
 * @param len  Number of tokens currently stored; incremented on success.
 * @param size Number of tokens the buffer can hold; updated on growth.
 * @param item Token to append (copied by value).
 *
 * When the array is full it grows by roughly 1.5x (plus one, so a capacity
 * of 0 still grows). If the bookkeeping is inconsistent (`*len > *size`),
 * the new capacity would overflow, or the allocation fails, the function
 * returns without modifying `*arr`, `*len` or `*size`; the existing buffer
 * stays valid and the token is not stored. Because the function returns
 * void, a caller can detect this only by checking whether `*len` advanced.
 */
void token_array_push(struct hf__token **arr, size_t *const len,
                      size_t *const size, struct hf__token item) {
  if (*len > *size) {
    // Inconsistent bookkeeping: writing at *len would be out of bounds.
    return;
  }

  if (*len == *size) {
    const size_t new_size = *size + 1 + (*size / 2);

    // Reject wrap-around of the capacity itself and of the byte count
    // handed to realloc.
    if (new_size <= *size || new_size > (size_t)-1 / sizeof **arr) {
      return;
    }

    // Go through a temporary so the old buffer is neither leaked nor
    // replaced by NULL when realloc fails.
    struct hf__token *const grown = realloc(*arr, sizeof **arr * new_size);
    if (grown == NULL) {
      return;
    }

    *arr = grown;
    *size = new_size;
  }

  (*arr)[*len] = item;
  (*len)++;
}

/**
 * Splits `src` into tokens and appends each one to `*tokens` through
 * token_array_push().
 *
 * @param src     Source text. Only indices [0, src_len) are read, so the
 *                buffer does not need to be NUL-terminated.
 * @param src_len Number of characters in `src`.
 * @param tokens  Address of the growable token array.
 * @param len     Number of tokens stored in `*tokens`; advanced per token.
 * @param size    Capacity of `*tokens`.
 *
 * Token locations are inclusive index ranges into `src`; an empty range is
 * expressed as `end == start - 1`.
 *
 * Recognised forms:
 *   - 'text'      char token covering the text between the quotes
 *   - number      word starting with a digit, or '-' followed by a digit
 *   - : and ;     single-character colon / semicolon tokens
 *   - ( text )    paren comment; the closing ')' must follow a space-like
 *                 character
 *   - \ text      backslash comment running to the end of the line
 *   - -- text     dash comment running to the end of the line
 *   - anything else is a word
 */
void hf__lex(const char *const src, const size_t src_len,
             struct hf__token **tokens, size_t *const len, size_t *const size) {
  size_t i = 0;
  while (i < src_len) {
    // Skip separators between tokens.
    if (hf__is_space_like(src[i]) || src[i] == '\n') {
      i++;
      continue;
    }

    size_t start = i;
    struct hf__token token;

    if (src[i] == '\'') {
      const size_t char_start = start;
      i++;
      start = i;

      // The bounds check comes first so src[src_len] is never read.
      while (i < src_len && src[i] != '\'') {
        i++;
      }

      if (i >= src_len) {
        // No closing quote: fall back to a word starting at the opening
        // quote. Here i == src_len, so the word ends at the last valid index.
        // NOTE(review): this swallows the rest of the input (including
        // whitespace) into one word; confirm that is intended.
        start = char_start;
        goto TOKEN_IS_WORD;
      }

      token.type = HF__TOKEN_TYPE__CHAR;
      token.location.start = start;
      token.location.end = i - 1;

      i++; // step over the closing quote
    } else {
      // Consume up to the next separator or the end of the input.
      while (i < src_len && !hf__is_space_like(src[i]) && src[i] != '\n') {
        i++;
      }
      const size_t str_len = i - start;

      if (hf__is_numeric(src[start]) || (src[start] == '-' && str_len > 1 &&
                                         hf__is_numeric(src[start + 1]))) {
        token.type = HF__TOKEN_TYPE__NUMBER;
        token.location.start = start;
        token.location.end = i - 1;
      } else if (str_len == 1 && src[start] == ':') {
        token.type = HF__TOKEN_TYPE__COLON;
        token.location.start = start;
        token.location.end = i - 1;
      } else if (str_len == 1 && src[start] == ';') {
        token.type = HF__TOKEN_TYPE__SEMICOLON;
        token.location.start = start;
        token.location.end = i - 1;
      } else if (str_len == 1 && src[start] == '(' && i < src_len &&
                 hf__is_space_like(src[i])) {
        // "( " opens a comment; look for a ')' preceded by a space-like
        // character.
        i++;
        bool got_end = false;
        while (i < src_len) {
          if (src[i] == ')' && hf__is_space_like(src[i - 1])) {
            got_end = true;
            break;
          }
          i++;
        }

        if (got_end) {
          token.type = HF__TOKEN_TYPE__PAREN_COMMENT;
          token.location.start = start + 2;
          // In "( )" one space is both the opening and the closing
          // separator, so i - 2 would land two positions before the start;
          // emit the usual empty range (end == start - 1) instead.
          token.location.end = (i == start + 2) ? start + 1 : i - 2;
          i++; // step over ')'
        } else {
          // Unterminated comment: treat the lone "(" as a word and resume
          // lexing right after it.
          i = start + 1;
          goto TOKEN_IS_WORD;
        }
      } else if (str_len == 1 && src[start] == '\\' && i < src_len &&
                 hf__is_space_like(src[i])) {
        token.type = HF__TOKEN_TYPE__BACKSLASH_COMMENT;

        // Skip the separator, then take everything up to the end of the
        // line. The newline itself is left for the separator skip above.
        start = ++i;
        while (i < src_len && src[i] != '\n') {
          i++;
        }

        token.location.start = start;
        token.location.end = i - 1;
      } else if (str_len == 2 && strncmp(src + start, "--", 2) == 0 &&
                 (i >= src_len || hf__is_space_like(src[i]))) {
        token.type = HF__TOKEN_TYPE__DASH_COMMENT;

        // Skip the separator only when there is one; at the end of the
        // input the comment is empty and i must not move past src_len.
        if (i < src_len) {
          i++;
        }
        start = i;
        while (i < src_len && src[i] != '\n') {
          i++;
        }

        token.location.start = start;
        token.location.end = i - 1;
      } else {
      TOKEN_IS_WORD:
        token.type = HF__TOKEN_TYPE__WORD;
        token.location.start = start;
        token.location.end = i - 1;
      }
    }

    token_array_push(tokens, len, size, token);
  }
}