Build a Tiny Compiler

Goal: Build a compiler for a simple language (arithmetic + variables + if/while) that targets x64 assembly. Understand the full pipeline: lexer → parser → AST → code generation.

Based on: Writing a Simple Compiler in C and shecc self-hosting compiler approach

Prerequisites: C pointers and structs, understanding of assembly (basic), recursion


The Compilation Pipeline

Source Code
    ↓
[Lexer] → Tokens
    ↓
[Parser] → AST (Abstract Syntax Tree)
    ↓
[Code Generator] → x64 Assembly
    ↓
Assemble + Link → Executable

Step 1: Lexer (Tokenizer)

Split source into tokens:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
 
typedef enum {
    TOKEN_EOF,
    TOKEN_INT,       // 'int'
    TOKEN_IF,        // 'if'
    TOKEN_ELSE,      // 'else'
    TOKEN_WHILE,     // 'while'
    TOKEN_RETURN,    // 'return'
    TOKEN_NUM,       // 42
    TOKEN_ID,        // foo
    TOKEN_PLUS,      // +
    TOKEN_MINUS,     // -
    TOKEN_STAR,      // *
    TOKEN_SLASH,     // /
    TOKEN_EQ,        // =
    TOKEN_EQEQ,      // ==
    TOKEN_LPAREN,    // (
    TOKEN_RPAREN,    // )
    TOKEN_LBRACE,    // {
    TOKEN_RBRACE,    // }
    TOKEN_SEMI,      // ;
} TokenType;
 
typedef struct {
    TokenType type;
    char *lexeme;      // raw text
    int value;         // for numbers
} Token;
 
typedef struct {
    char *input;
    size_t pos;
    size_t len;
} Lexer;
 
Lexer lexer_create(char *input) {
    return (Lexer){.input = input, .pos = 0, .len = strlen(input)};
}
 
char peek(Lexer *l) {
    if (l->pos >= l->len) return '\0';
    return l->input[l->pos];
}
 
char next(Lexer *l) {
    return l->input[l->pos++];
}
 
void skip_whitespace(Lexer *l) {
    while (isspace(peek(l))) next(l);
}
 
Token next_token(Lexer *l) {
    skip_whitespace(l);
    
    char c = peek(l);
    if (c == '\0') return (Token){TOKEN_EOF, "", 0};
    
    // Two-character operators
    if (c == '=' && l->input[l->pos + 1] == '=') {
        next(l); next(l);
        return (Token){TOKEN_EQEQ, "==", 0};
    }
    
    // Single character
    switch (c) {
        case '+': next(l); return (Token){TOKEN_PLUS, "+", 0};
        case '-': next(l); return (Token){TOKEN_MINUS, "-", 0};
        case '*': next(l); return (Token){TOKEN_STAR, "*", 0};
        case '/': next(l); return (Token){TOKEN_SLASH, "/", 0};
        case '=': next(l); return (Token){TOKEN_EQ, "=", 0};
        case '(': next(l); return (Token){TOKEN_LPAREN, "(", 0};
        case ')': next(l); return (Token){TOKEN_RPAREN, ")", 0};
        case '{': next(l); return (Token){TOKEN_LBRACE, "{", 0};
        case '}': next(l); return (Token){TOKEN_RBRACE, "}", 0};
        case ';': next(l); return (Token){TOKEN_SEMI, ";", 0};
    }
    
    // Number
    if (isdigit(c)) {
        int val = 0;
        while (isdigit(peek(l))) {
            val = val * 10 + (next(l) - '0');
        }
        return (Token){TOKEN_NUM, "", val};
    }
    
    // Identifier or keyword
    if (isalpha(c) || c == '_') {
        size_t start = l->pos;
        while (isalnum(peek(l)) || peek(l) == '_') next(l);
        
        size_t len = l->pos - start;
        char *name = strndup(l->input + start, len);
        
        // Keywords
        if (!strcmp(name, "int")) return (Token){TOKEN_INT, name, 0};
        if (!strcmp(name, "if")) return (Token){TOKEN_IF, name, 0};
        if (!strcmp(name, "else")) return (Token){TOKEN_ELSE, name, 0};
        if (!strcmp(name, "while")) return (Token){TOKEN_WHILE, name, 0};
        if (!strcmp(name, "return")) return (Token){TOKEN_RETURN, name, 0};
        
        return (Token){TOKEN_ID, name, 0};
    }
    
    next(l);
    return (Token){TOKEN_EOF, "", 0};
}

Step 2: AST Nodes

typedef enum {
    AST_NUM,
    AST_VAR,        // variable reference
    AST_BINOP,      // binary operation
    AST_ASSIGN,     // assignment
    AST_IF,
    AST_WHILE,
    AST_RETURN,
    AST_SEQ,        // sequence (statement list)
    AST_DECL,       // variable declaration
} AstType;
 
typedef struct AST {
    AstType type;
    union {
        int num;                           // AST_NUM
        char *var;                         // AST_VAR
        struct {
            char op;                       // +, -, *, /, ==
            struct AST *left;
            struct AST *right;
        } binop;
        struct {
            struct AST *cond;
            struct AST *then;
            struct AST *els;  // may be NULL
        } if_;
        struct {
            struct AST *cond;
            struct AST *body;
        } while_;
        struct {
            char *name;
            struct AST *value;
        } assign;
        struct {
            char *name;
        } decl;
        struct AST *ret;
        struct {
            struct AST *head;
            struct AST *tail;
        } seq;
    };
} AST;

Step 3: Recursive Descent Parser

Top-down parsing using grammar:

program    → stmt*
stmt       → "int" ID ";" | expr ";" | "return" expr ";" | if_stmt | while_stmt | "{" stmt* "}"
if_stmt    → "if" "(" expr ")" stmt ("else" stmt)?
while_stmt → "while" "(" expr ")" stmt
expr       → equality
equality   → term (("==" | "!=") term)*
term       → factor (("+" | "-") factor)*
factor     → unary (("*" | "/") unary)*
unary      → ("-" | "!") unary | primary
primary    → NUM | ID | "(" expr ")"
typedef struct {
    Token *tokens;
    size_t pos;
    size_t count;
} Parser;
 
AST *parse_expr(Parser *p);
AST *parse_statement(Parser *p);
 
AST *parse_primary(Parser *p) {
    Token t = p->tokens[p->pos];
    
    if (t.type == TOKEN_NUM) {
        p->pos++;
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_NUM;
        n->num = t.value;
        return n;
    }
    
    if (t.type == TOKEN_ID) {
        p->pos++;
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_VAR;
        n->var = strdup(t.lexeme);
        return n;
    }
    
    if (t.type == TOKEN_LPAREN) {
        p->pos++;
        AST *n = parse_expr(p);
        if (p->tokens[p->pos].type == TOKEN_RPAREN) p->pos++;
        return n;
    }
    
    fprintf(stderr, "Unexpected token: %d\n", t.type);
    exit(1);
}
 
AST *parse_unary(Parser *p) {
    Token t = p->tokens[p->pos];
    
    if (t.type == TOKEN_MINUS) {
        p->pos++;
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_BINOP;
        n->binop.op = '-';
        n->binop.left = calloc(1, sizeof(AST));
        n->binop.left->type = AST_NUM;
        n->binop.left->num = 0;
        n->binop.right = parse_unary(p);
        return n;
    }
    
    return parse_primary(p);
}
 
AST *parse_factor(Parser *p) {
    AST *left = parse_unary(p);
    
    while (p->tokens[p->pos].type == TOKEN_STAR || 
           p->tokens[p->pos].type == TOKEN_SLASH) {
        Token t = p->tokens[p->pos++];
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_BINOP;
        n->binop.op = t.type == TOKEN_STAR ? '*' : '/';
        n->binop.left = left;
        n->binop.right = parse_unary(p);
        left = n;
    }
    
    return left;
}
 
AST *parse_expr(Parser *p) {
    return parse_factor(p);  // For simplicity, just factor
}
 
AST *parse_statement(Parser *p) {
    Token t = p->tokens[p->pos];
    
    if (t.type == TOKEN_INT) {
        p->pos++;  // skip 'int'
        char *name = strdup(p->tokens[p->pos].lexeme);
        p->pos++;  // skip ID
        p->pos++;  // skip ';'
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_DECL;
        n->decl.name = name;
        return n;
    }
    
    if (t.type == TOKEN_ID && p->tokens[p->pos + 1].type == TOKEN_EQ) {
        char *name = strdup(t.lexeme);
        p->pos += 2;  // skip ID and =
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_ASSIGN;
        n->assign.name = name;
        n->assign.value = parse_expr(p);
        p->pos++;  // skip ';'
        return n;
    }
    
    if (t.type == TOKEN_IF) {
        p->pos++;  // skip 'if'
        p->pos++;  // skip '('
        AST *cond = parse_expr(p);
        p->pos++;  // skip ')'
        AST *then = parse_statement(p);
        AST *els = NULL;
        if (p->tokens[p->pos].type == TOKEN_ELSE) {
            p->pos++;
            els = parse_statement(p);
        }
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_IF;
        n->if_.cond = cond;
        n->if_.then = then;
        n->if_.els = els;
        return n;
    }
    
    if (t.type == TOKEN_WHILE) {
        p->pos++;  // skip 'while'
        p->pos++;  // skip '('
        AST *cond = parse_expr(p);
        p->pos++;  // skip ')'
        AST *body = parse_statement(p);
        AST *n = calloc(1, sizeof(AST));
        n->type = AST_WHILE;
        n->while_.cond = cond;
        n->while_.body = body;
        return n;
    }
    
    if (t.type == TOKEN_LBRACE) {
        p->pos++;  // skip '{'
        AST *seq = calloc(1, sizeof(AST));
        seq->type = AST_SEQ;
        AST **tail = &seq->seq.head;
        
        while (p->tokens[p->pos].type != TOKEN_RBRACE && 
               p->tokens[p->pos].type != TOKEN_EOF) {
            *tail = parse_statement(p);
            tail = &(*tail)->seq.tail;
        }
        
        p->pos++;  // skip '}'
        return seq;
    }
    
    return parse_expr(p);
}

Step 4: Code Generation (x64)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
typedef struct {
    FILE *out;
    int label_count;
    int stack_offset;
} Codegen;
 
char *reg64[] = {"rdi", "rsi", "rdx", "rcx", "r8", "r9"};
 
void cg_init(Codegen *cg, FILE *out) {
    cg->out = out;
    cg->label_count = 0;
    cg->stack_offset = 0;
}
 
int alloc_stack(Codegen *cg, int size) {
    cg->stack_offset += size;
    return -cg->stack_offset;
}
 
void cg_expr(Codegen *cg, AST *ast);
 
void cg_number(Codegen *cg, int n) {
    fprintf(cg->out, "    mov rax, %d\n", n);
}
 
void cg_binop(Codegen *cg, char op, AST *left, AST *right) {
    // Evaluate left into rax
    cg_expr(cg, left);
    fprintf(cg->out, "    push rax\n");
    
    // Evaluate right
    cg_expr(cg, right);
    fprintf(cg->out, "    pop rdi\n");
    
    switch (op) {
        case '+': fprintf(cg->out, "    add rax, rdi\n"); break;
        case '-': fprintf(cg->out, "    sub rax, rdi\n"); break;
        case '*': fprintf(cg->out, "    imul rax, rdi\n"); break;
        case '/': fprintf(cg->out, "    cqo\n    idiv rdi\n"); break;
    }
}
 
void cg_compare(Codegen *cg, AST *left, AST *right) {
    cg_expr(cg, left);
    fprintf(cg->out, "    push rax\n");
    cg_expr(cg, right);
    fprintf(cg->out, "    pop rdi\n");
    fprintf(cg->out, "    cmp rdi, rax\n");
}
 
void cg_expr(Codegen *cg, AST *ast) {
    switch (ast->type) {
        case AST_NUM:
            cg_number(cg, ast->num);
            break;
            
        case AST_VAR:
            // For simplicity, variables are global
            fprintf(cg->out, "    mov rax, [%s]\n", ast->var);
            break;
            
        case AST_BINOP:
            cg_binop(cg, ast->binop.op, ast->binop.left, ast->binop.right);
            break;
            
        case AST_ASSIGN:
            cg_expr(cg, ast->assign.value);
            fprintf(cg->out, "    mov [%s], rax\n", ast->assign.name);
            break;
    }
}
 
void cg_statement(Codegen *cg, AST *ast) {
    switch (ast->type) {
        case AST_DECL:
            // Allocate global variable, zero-initialized
            fprintf(cg->out, "%s: dq 0\n", ast->decl.name);
            break;
            
        case AST_ASSIGN:
            cg_expr(cg, ast);
            break;
            
        case AST_IF: {
            int else_label = cg->label_count++;
            int end_label = cg->label_count++;
            
            cg_compare(cg, ast->if_.cond->binop.left, 
                            ast->if_.cond->binop.right);
            fprintf(cg->out, "    jne .L%d\n", else_label);
            cg_statement(cg, ast->if_.then);
            fprintf(cg->out, "    jmp .L%d\n", end_label);
            fprintf(cg->out, ".L%d:\n", else_label);
            if (ast->if_.els) cg_statement(cg, ast->if_.els);
            fprintf(cg->out, ".L%d:\n", end_label);
            break;
        }
        
        case AST_WHILE: {
            int start_label = cg->label_count++;
            int end_label = cg->label_count++;
            
            fprintf(cg->out, ".L%d:\n", start_label);
            cg_compare(cg, ast->while_.cond->binop.left,
                            ast->while_.cond->binop.right);
            fprintf(cg->out, "    je .L%d\n", end_label);
            cg_statement(cg, ast->while_.body);
            fprintf(cg->out, "    jmp .L%d\n", start_label);
            fprintf(cg->out, ".L%d:\n", end_label);
            break;
        }
        
        case AST_SEQ:
            for (AST *s = ast->seq.head; s; s = s->seq.tail) {
                cg_statement(cg, s);
            }
            break;
    }
}
 
void cg_program(AST *ast) {
    FILE *out = fopen("out.s", "w");
    Codegen cg;
    cg_init(&cg, out);
    
    fprintf(out, "    .global main\n");
    fprintf(out, "main:\n");
    
    cg_statement(&cg, ast);
    
    fprintf(out, "    ret\n");
    fclose(out);
}

Step 5: Putting It Together

int main(int argc, char **argv) {
    // Read source file
    FILE *f = fopen(argv[1], "r");
    fseek(f, 0, SEEK_END);
    long len = ftell(f);
    fseek(f, 0, SEEK_SET);
    char *src = malloc(len + 1);
    fread(src, 1, len, f);
    src[len] = '\0';
    
    // Lex
    Lexer lex = lexer_create(src);
    Token tokens[1024];
    size_t n = 0;
    while ((tokens[n++] = next_token(&lex)).type != TOKEN_EOF);
    
    // Parse
    Parser p = {.tokens = tokens, .pos = 0, .count = n};
    AST *ast = parse_statement(&p);
    
    // Generate
    cg_program(ast);
    
    printf("Generated out.s\n");
    return 0;
}

Example input (test.c):

int x;
x = 10;
while(x) {
    x = x - 1;
}

Compile and run:

gcc -o compiler compiler.c
./compiler test.c
as -o out.o out.s
gcc -o test out.o
./test

Key Concepts

  1. Lexer: Character stream → Token stream. Pure pattern matching.
  2. Parser: Token stream → AST. Grammar-driven recursive descent.
  3. AST: Tree representation preserving operator precedence.
  4. Codegen: AST → Assembly. Each node type has a generation rule.

Grammar matters: The grammar in this tutorial is simplified. Real compilers need:

  • Proper operator precedence (handed by grammar structure)
  • Type checking
  • Symbol table management
  • Register allocation (we use a simple global variable approach)

Exercises

  1. Add == comparison to expressions (needs comparison and conditional jump)
  2. Add else if chains — extend the grammar and code generation
  3. Add function calls — need calling convention, stack frame setup
  4. Add local variables — track stack offsets, build symbol table
  5. Target bytecode VM instead of x64 — simpler, portable

See Also