Build a Tiny Compiler
Goal: Build a compiler for a simple language (arithmetic + variables + if/while) that targets x64 assembly. Understand the full pipeline: lexer → parser → AST → code generation.
Based on: Writing a Simple Compiler in C and shecc self-hosting compiler approach
Prerequisites: C pointers and structs, understanding of assembly (basic), recursion
The Compilation Pipeline
Source Code
↓
[Lexer] → Tokens
↓
[Parser] → AST (Abstract Syntax Tree)
↓
[Code Generator] → x64 Assembly
↓
Assemble + Link → Executable
Step 1: Lexer (Tokenizer)
Split source into tokens:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
typedef enum {
TOKEN_EOF,
TOKEN_INT, // 'int'
TOKEN_IF, // 'if'
TOKEN_ELSE, // 'else'
TOKEN_WHILE, // 'while'
TOKEN_RETURN, // 'return'
TOKEN_NUM, // 42
TOKEN_ID, // foo
TOKEN_PLUS, // +
TOKEN_MINUS, // -
TOKEN_STAR, // *
TOKEN_SLASH, // /
TOKEN_EQ, // =
TOKEN_EQEQ, // ==
TOKEN_LPAREN, // (
TOKEN_RPAREN, // )
TOKEN_LBRACE, // {
TOKEN_RBRACE, // }
TOKEN_SEMI, // ;
} TokenType;
typedef struct {
TokenType type;
char *lexeme; // raw text
int value; // for numbers
} Token;
typedef struct {
char *input;
size_t pos;
size_t len;
} Lexer;
Lexer lexer_create(char *input) {
return (Lexer){.input = input, .pos = 0, .len = strlen(input)};
}
char peek(Lexer *l) {
if (l->pos >= l->len) return '\0';
return l->input[l->pos];
}
char next(Lexer *l) {
return l->input[l->pos++];
}
void skip_whitespace(Lexer *l) {
while (isspace(peek(l))) next(l);
}
Token next_token(Lexer *l) {
skip_whitespace(l);
char c = peek(l);
if (c == '\0') return (Token){TOKEN_EOF, "", 0};
// Two-character operators
if (c == '=' && l->input[l->pos + 1] == '=') {
next(l); next(l);
return (Token){TOKEN_EQEQ, "==", 0};
}
// Single character
switch (c) {
case '+': next(l); return (Token){TOKEN_PLUS, "+", 0};
case '-': next(l); return (Token){TOKEN_MINUS, "-", 0};
case '*': next(l); return (Token){TOKEN_STAR, "*", 0};
case '/': next(l); return (Token){TOKEN_SLASH, "/", 0};
case '=': next(l); return (Token){TOKEN_EQ, "=", 0};
case '(': next(l); return (Token){TOKEN_LPAREN, "(", 0};
case ')': next(l); return (Token){TOKEN_RPAREN, ")", 0};
case '{': next(l); return (Token){TOKEN_LBRACE, "{", 0};
case '}': next(l); return (Token){TOKEN_RBRACE, "}", 0};
case ';': next(l); return (Token){TOKEN_SEMI, ";", 0};
}
// Number
if (isdigit(c)) {
int val = 0;
while (isdigit(peek(l))) {
val = val * 10 + (next(l) - '0');
}
return (Token){TOKEN_NUM, "", val};
}
// Identifier or keyword
if (isalpha(c) || c == '_') {
size_t start = l->pos;
while (isalnum(peek(l)) || peek(l) == '_') next(l);
size_t len = l->pos - start;
char *name = strndup(l->input + start, len);
// Keywords
if (!strcmp(name, "int")) return (Token){TOKEN_INT, name, 0};
if (!strcmp(name, "if")) return (Token){TOKEN_IF, name, 0};
if (!strcmp(name, "else")) return (Token){TOKEN_ELSE, name, 0};
if (!strcmp(name, "while")) return (Token){TOKEN_WHILE, name, 0};
if (!strcmp(name, "return")) return (Token){TOKEN_RETURN, name, 0};
return (Token){TOKEN_ID, name, 0};
}
next(l);
return (Token){TOKEN_EOF, "", 0};
}Step 2: AST Nodes
typedef enum {
AST_NUM,
AST_VAR, // variable reference
AST_BINOP, // binary operation
AST_ASSIGN, // assignment
AST_IF,
AST_WHILE,
AST_RETURN,
AST_SEQ, // sequence (statement list)
AST_DECL, // variable declaration
} AstType;
typedef struct AST {
AstType type;
union {
int num; // AST_NUM
char *var; // AST_VAR
struct {
char op; // +, -, *, /, ==
struct AST *left;
struct AST *right;
} binop;
struct {
struct AST *cond;
struct AST *then;
struct AST *els; // may be NULL
} if_;
struct {
struct AST *cond;
struct AST *body;
} while_;
struct {
char *name;
struct AST *value;
} assign;
struct {
char *name;
} decl;
struct AST *ret;
struct {
struct AST *head;
struct AST *tail;
} seq;
};
} AST;Step 3: Recursive Descent Parser
Top-down parsing using grammar:
program → stmt*
stmt → "int" ID ";" | expr ";" | "return" expr ";" | if_stmt | while_stmt | "{" stmt* "}"
if_stmt → "if" "(" expr ")" stmt ("else" stmt)?
while_stmt → "while" "(" expr ")" stmt
expr → equality
equality → term (("==" | "!=") term)*
term → factor (("+" | "-") factor)*
factor → unary (("*" | "/") unary)*
unary → ("-" | "!") unary | primary
primary → NUM | ID | "(" expr ")"
typedef struct {
Token *tokens;
size_t pos;
size_t count;
} Parser;
AST *parse_expr(Parser *p);
AST *parse_statement(Parser *p);
AST *parse_primary(Parser *p) {
Token t = p->tokens[p->pos];
if (t.type == TOKEN_NUM) {
p->pos++;
AST *n = calloc(1, sizeof(AST));
n->type = AST_NUM;
n->num = t.value;
return n;
}
if (t.type == TOKEN_ID) {
p->pos++;
AST *n = calloc(1, sizeof(AST));
n->type = AST_VAR;
n->var = strdup(t.lexeme);
return n;
}
if (t.type == TOKEN_LPAREN) {
p->pos++;
AST *n = parse_expr(p);
if (p->tokens[p->pos].type == TOKEN_RPAREN) p->pos++;
return n;
}
fprintf(stderr, "Unexpected token: %d\n", t.type);
exit(1);
}
AST *parse_unary(Parser *p) {
Token t = p->tokens[p->pos];
if (t.type == TOKEN_MINUS) {
p->pos++;
AST *n = calloc(1, sizeof(AST));
n->type = AST_BINOP;
n->binop.op = '-';
n->binop.left = calloc(1, sizeof(AST));
n->binop.left->type = AST_NUM;
n->binop.left->num = 0;
n->binop.right = parse_unary(p);
return n;
}
return parse_primary(p);
}
AST *parse_factor(Parser *p) {
AST *left = parse_unary(p);
while (p->tokens[p->pos].type == TOKEN_STAR ||
p->tokens[p->pos].type == TOKEN_SLASH) {
Token t = p->tokens[p->pos++];
AST *n = calloc(1, sizeof(AST));
n->type = AST_BINOP;
n->binop.op = t.type == TOKEN_STAR ? '*' : '/';
n->binop.left = left;
n->binop.right = parse_unary(p);
left = n;
}
return left;
}
AST *parse_expr(Parser *p) {
return parse_factor(p); // For simplicity, just factor
}
AST *parse_statement(Parser *p) {
Token t = p->tokens[p->pos];
if (t.type == TOKEN_INT) {
p->pos++; // skip 'int'
char *name = strdup(p->tokens[p->pos].lexeme);
p->pos++; // skip ID
p->pos++; // skip ';'
AST *n = calloc(1, sizeof(AST));
n->type = AST_DECL;
n->decl.name = name;
return n;
}
if (t.type == TOKEN_ID && p->tokens[p->pos + 1].type == TOKEN_EQ) {
char *name = strdup(t.lexeme);
p->pos += 2; // skip ID and =
AST *n = calloc(1, sizeof(AST));
n->type = AST_ASSIGN;
n->assign.name = name;
n->assign.value = parse_expr(p);
p->pos++; // skip ';'
return n;
}
if (t.type == TOKEN_IF) {
p->pos++; // skip 'if'
p->pos++; // skip '('
AST *cond = parse_expr(p);
p->pos++; // skip ')'
AST *then = parse_statement(p);
AST *els = NULL;
if (p->tokens[p->pos].type == TOKEN_ELSE) {
p->pos++;
els = parse_statement(p);
}
AST *n = calloc(1, sizeof(AST));
n->type = AST_IF;
n->if_.cond = cond;
n->if_.then = then;
n->if_.els = els;
return n;
}
if (t.type == TOKEN_WHILE) {
p->pos++; // skip 'while'
p->pos++; // skip '('
AST *cond = parse_expr(p);
p->pos++; // skip ')'
AST *body = parse_statement(p);
AST *n = calloc(1, sizeof(AST));
n->type = AST_WHILE;
n->while_.cond = cond;
n->while_.body = body;
return n;
}
if (t.type == TOKEN_LBRACE) {
p->pos++; // skip '{'
AST *seq = calloc(1, sizeof(AST));
seq->type = AST_SEQ;
AST **tail = &seq->seq.head;
while (p->tokens[p->pos].type != TOKEN_RBRACE &&
p->tokens[p->pos].type != TOKEN_EOF) {
*tail = parse_statement(p);
tail = &(*tail)->seq.tail;
}
p->pos++; // skip '}'
return seq;
}
return parse_expr(p);
}Step 4: Code Generation (x64)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct {
FILE *out;
int label_count;
int stack_offset;
} Codegen;
char *reg64[] = {"rdi", "rsi", "rdx", "rcx", "r8", "r9"};
void cg_init(Codegen *cg, FILE *out) {
cg->out = out;
cg->label_count = 0;
cg->stack_offset = 0;
}
int alloc_stack(Codegen *cg, int size) {
cg->stack_offset += size;
return -cg->stack_offset;
}
void cg_expr(Codegen *cg, AST *ast);
void cg_number(Codegen *cg, int n) {
fprintf(cg->out, " mov rax, %d\n", n);
}
void cg_binop(Codegen *cg, char op, AST *left, AST *right) {
// Evaluate left into rax
cg_expr(cg, left);
fprintf(cg->out, " push rax\n");
// Evaluate right
cg_expr(cg, right);
fprintf(cg->out, " pop rdi\n");
switch (op) {
case '+': fprintf(cg->out, " add rax, rdi\n"); break;
case '-': fprintf(cg->out, " sub rax, rdi\n"); break;
case '*': fprintf(cg->out, " imul rax, rdi\n"); break;
case '/': fprintf(cg->out, " cqo\n idiv rdi\n"); break;
}
}
void cg_compare(Codegen *cg, AST *left, AST *right) {
cg_expr(cg, left);
fprintf(cg->out, " push rax\n");
cg_expr(cg, right);
fprintf(cg->out, " pop rdi\n");
fprintf(cg->out, " cmp rdi, rax\n");
}
void cg_expr(Codegen *cg, AST *ast) {
switch (ast->type) {
case AST_NUM:
cg_number(cg, ast->num);
break;
case AST_VAR:
// For simplicity, variables are global
fprintf(cg->out, " mov rax, [%s]\n", ast->var);
break;
case AST_BINOP:
cg_binop(cg, ast->binop.op, ast->binop.left, ast->binop.right);
break;
case AST_ASSIGN:
cg_expr(cg, ast->assign.value);
fprintf(cg->out, " mov [%s], rax\n", ast->assign.name);
break;
}
}
void cg_statement(Codegen *cg, AST *ast) {
switch (ast->type) {
case AST_DECL:
// Allocate global variable, zero-initialized
fprintf(cg->out, "%s: dq 0\n", ast->decl.name);
break;
case AST_ASSIGN:
cg_expr(cg, ast);
break;
case AST_IF: {
int else_label = cg->label_count++;
int end_label = cg->label_count++;
cg_compare(cg, ast->if_.cond->binop.left,
ast->if_.cond->binop.right);
fprintf(cg->out, " jne .L%d\n", else_label);
cg_statement(cg, ast->if_.then);
fprintf(cg->out, " jmp .L%d\n", end_label);
fprintf(cg->out, ".L%d:\n", else_label);
if (ast->if_.els) cg_statement(cg, ast->if_.els);
fprintf(cg->out, ".L%d:\n", end_label);
break;
}
case AST_WHILE: {
int start_label = cg->label_count++;
int end_label = cg->label_count++;
fprintf(cg->out, ".L%d:\n", start_label);
cg_compare(cg, ast->while_.cond->binop.left,
ast->while_.cond->binop.right);
fprintf(cg->out, " je .L%d\n", end_label);
cg_statement(cg, ast->while_.body);
fprintf(cg->out, " jmp .L%d\n", start_label);
fprintf(cg->out, ".L%d:\n", end_label);
break;
}
case AST_SEQ:
for (AST *s = ast->seq.head; s; s = s->seq.tail) {
cg_statement(cg, s);
}
break;
}
}
void cg_program(AST *ast) {
FILE *out = fopen("out.s", "w");
Codegen cg;
cg_init(&cg, out);
fprintf(out, " .global main\n");
fprintf(out, "main:\n");
cg_statement(&cg, ast);
fprintf(out, " ret\n");
fclose(out);
}Step 5: Putting It Together
int main(int argc, char **argv) {
// Read source file
FILE *f = fopen(argv[1], "r");
fseek(f, 0, SEEK_END);
long len = ftell(f);
fseek(f, 0, SEEK_SET);
char *src = malloc(len + 1);
fread(src, 1, len, f);
src[len] = '\0';
// Lex
Lexer lex = lexer_create(src);
Token tokens[1024];
size_t n = 0;
while ((tokens[n++] = next_token(&lex)).type != TOKEN_EOF);
// Parse
Parser p = {.tokens = tokens, .pos = 0, .count = n};
AST *ast = parse_statement(&p);
// Generate
cg_program(ast);
printf("Generated out.s\n");
return 0;
}Example input (test.c):
int x;
x = 10;
while(x) {
x = x - 1;
}
Compile and run:
gcc -o compiler compiler.c
./compiler test.c
as -o out.o out.s
gcc -o test out.o
./testKey Concepts
- Lexer: Character stream → Token stream. Pure pattern matching.
- Parser: Token stream → AST. Grammar-driven recursive descent.
- AST: Tree representation preserving operator precedence.
- Codegen: AST → Assembly. Each node type has a generation rule.
Grammar matters: The grammar in this tutorial is simplified. Real compilers need:
- Proper operator precedence (handed by grammar structure)
- Type checking
- Symbol table management
- Register allocation (we use a simple global variable approach)
Exercises
- Add
==comparison to expressions (needs comparison and conditional jump) - Add
else ifchains — extend the grammar and code generation - Add function calls — need calling convention, stack frame setup
- Add local variables — track stack offsets, build symbol table
- Target bytecode VM instead of x64 — simpler, portable
See Also
- C Language Essentials — C concepts this builds on
- 04 - Build a Mini Shell — parsing command strings, similar techniques
- Writing a Simple Compiler in C — similar project
- shecc — self-hosting educational C compiler for ARM/RISC-V
- Crafting Interpreters — excellent book on building interpreters (free online)