December 11, 2019
December 08, 2019
Writing a lisp compiler from scratch in JavaScript: 6. an x86 upgrade
Previously in compiler basics:
<! forgive me, for I have sinned >
1. lisp to assembly
2. user-defined functions and variables
3. LLVM
4. LLVM conditionals and compiling fibonacci
5. LLVM system calls
This post upgrades the ulisp x86 backend from using a limited set of registers (with no spilling support) to solely using the stack to pass values between expressions.
This is a slightly longer post since we've got a lot of catchup to do to get to feature parity with the LLVM backend. Namely:
- "Infinite" locals, parameters
- Function definitions
- Variable references
- Arithmetic and logical operations
- If
- Syscalls
We'll tackle the first four points first and finish up with the last two. This way we can support the same fibonacci program that prints integers to stdout that we support in the LLVM backend.
As always the code is available on Github.
But first a digression into how this is suddenly easy for us to do with x86 and one-pass (sorta) code generation.
Stack-based languages
Stack-based languages have the extremely convenient attribute that values are (by default) stored on the stack, which allows a code generator targeting a stack-based language the option to omit handling register allocation. And as it happens, x86 has enough support to make it easy to treat as a stack machine.
As we build out the code generator for x86 as a stack machine we need to keep two commitments in mind:
- Every expression must pop all arguments/operands
- Every expression must store one result back on the stack
In the future, we may replace the second commitment. But for now it is more than enough.
Boilerplate
We'll start with the existing x86 backend code and strip all the implementation code:
const cp = require('child_process');
const fs = require('fs');
const os = require('os');
let GLOBAL_COUNTER = 0;
const SYSCALL_MAP = {
darwin: {
exit: '0x2000001',
write: '0x2000004',
},
linux: {
exit: 60,
write: 1,
},
}[os.platform()];
class Scope {}
class Compiler {
constructor() {
this.outBuffer = [];
this.primitiveFunctions = {
def: this.compileDefine.bind(this),
begin: this.compileBegin.bind(this),
if: this.compileIf.bind(this),
...this.prepareArithmeticWrappers(),
...this.prepareLogicalWrappers(),
...this.prepareSyscallWrappers(),
};
}
prepareArithmeticWrappers() {}
prepareLogicalWrappers() {}
prepareSyscallWrappers() {}
emit(depth, args) {
if (depth === undefined || args === undefined) {
throw new Error('Invalid call to emit');
}
const indent = new Array(depth + 1).join(' ');
this.outBuffer.push(indent + args);
}
compileExpression(arg, scope, depth) {}
compileIf([test, then, els], scope, depth) {}
compileBegin(body, scope, depth, topLevel = false) {}
compileDefine([name, params, ...body], scope, depth) {}
compileCall(fun, args, scope, depth) {}
emitPrefix() {
this.emit(1, '.global _main\n');
this.emit(1, '.text\n');
}
emitPostfix() {
this.emit(0, '_main:');
this.emit(1, 'CALL main');
this.emit(1, 'MOV RDI, RAX'); // Set exit arg
this.emit(1, `MOV RAX, ${SYSCALL_MAP['exit']}`);
this.emit(1, 'SYSCALL');
}
getOutput() {
const output = this.outBuffer.join('\n');
// Leave at most one empty line
return output.replace(/\n\n\n+/g, '\n\n');
}
}
module.exports.compile = function(ast) {
const c = new Compiler();
c.emitPrefix();
const s = new Scope();
c.compileBegin(ast, s, 1, true);
c.emitPostfix();
return c.getOutput();
};
module.exports.build = function(buildDir, program) {
const prog = 'prog';
fs.writeFileSync(`${buildDir}/${prog}.s`, program);
cp.execSync(
`gcc -mstackrealign -masm=intel -o ${buildDir}/${prog} ${buildDir}/${prog}.s`,
);
};
The prefix and postfix stays mostly the same as the original implementation. But we'll assume a couple of new helpers to get us in parity with the LLVM backend:
compileDefinecompileBegincompileIfcompileCallprepareArithmeticWrappersprepareLogicalWrappersprepareSyscallWrappers
The prepareArithmeticWrappers helper will define wrappers
for arithmetic instructions. The prepareLogicalWrappers
helper will define wrappers for logical instructions. And the
prepareSyscallWrappers helper will define a wrapper for
syscalls and generate builtins based on the SYSCALL_MAP entries.
Scope
Similar to our LLVM backend's Context and Scope helpers we'll define our own for the x86 backend. Since we're placing all locals on the stack, the two most important things Scope will do for us are:
- Map identifiers to escaped strings
- Store and increment the location of the local on the stack
Here's what it will look like:
class Scope {
constructor() {
this.localOffset = 1;
this.map = {};
}
assign(name) {
const safe = name.replace('-', '_');
this.map[safe] = this.localOffset++;
return safe;
}
symbol() {
return this.localOffset++;
}
lookup(name) {
const safe = name.replace('-', '_');
if (this.map[safe]) {
return { name: safe, offset: this.map[safe] };
}
return null;
}
copy() {
const s = new Scope();
// In the future we may need to store s.scopeOffset = this.scopeOffset + 1
// so we can read outer-scoped values at runtime.
s.map = { ...this.map };
return s;
}
}
compileExpression
An expression will be one of:
- A function call (possibly a builtin like
defor+) - A literal value (e.g.
29) - A reference (e.g.
&c) - An identifier (e.g.
my-var)
We'll handle compiling an expression in that order. If the AST
argument passed to compileExpression is an array, we will
call compileCall and return.
compileExpression(arg, scope, depth) {
// Is a nested function call, compile it
if (Array.isArray(arg)) {
this.compileCall(arg[0], arg.slice(1), scope, depth);
return;
}
}
If the AST is a number, we will push the number onto the stack and return.
compileExpression(arg, scope, depth) {
// Is a nested function call, compile it
if (Array.isArray(arg)) {
this.compileCall(arg[0], arg.slice(1), scope, depth);
return;
}
if (Number.isInteger(arg)) {
this.emit(depth, `PUSH ${arg}`);
return;
}
}
If the AST is a string that starts with & we will look up
the location of the identifier after the &, push its
location onto the stack and return.
We count on the Scope storing its offset from the "frame pointer",
which we will later set up to be stored in RBP.
Locals will be stored after the frame pointer and parameters will be stored before it. So we'll need to add or subtract from the frame pointer depending on if we need a positive or negative offset from it.
compileExpression(arg, scope, depth) {
// Is a nested function call, compile it
if (Array.isArray(arg)) {
this.compileCall(arg[0], arg.slice(1), scope, depth);
return;
}
if (Number.isInteger(arg)) {
this.emit(depth, `PUSH ${arg}`);
return;
}
if (arg.startsWith('&')) {
const { offset } = scope.lookup(arg.substring(1));
// Copy the frame pointer so we can return an offset from it
this.emit(depth, `MOV RAX, RBP`);
const operation = offset < 0 ? 'ADD' : 'SUB';
this.emit(depth, `${operation} RAX, ${Math.abs(offset * 8)} # ${arg}`);
this.emit(depth, `PUSH RAX`);
return;
}
}
Finally, we'll look up the identifier and copy the value (in its offset on the stack) to the top of the stack.
compileExpression(arg, scope, depth) {
// Is a nested function call, compile it
if (Array.isArray(arg)) {
this.compileCall(arg[0], arg.slice(1), scope, depth);
return;
}
if (Number.isInteger(arg)) {
this.emit(depth, `PUSH ${arg}`);
return;
}
if (arg.startsWith('&')) {
const { offset } = scope.lookup(arg.substring(1));
// Copy the frame pointer so we can return an offset from it
this.emit(depth, `MOV RAX, RBP`);
const operation = offset < 0 ? 'ADD' : 'SUB';
this.emit(depth, `${operation} RAX, ${Math.abs(offset * 8)} # ${arg}`);
this.emit(depth, `PUSH RAX`);
return;
}
// Variable lookup
const { offset } = scope.lookup(arg);
if (offset) {
const operation = offset < 0 ? '+' : '-';
this.emit(
depth,
`PUSH [RBP ${operation} ${Math.abs(offset * 8)}] # ${arg}`,
);
} else {
throw new Error(
'Attempt to reference undefined variable or unsupported literal: ' +
arg,
);
}
}
And that's it for handling expression! Let's add
compileCall support now that we referenced it.
compileCall
A call will first check if the call is a builtin. If so, it will immediately pass control to the builtin.
compileCall(fun, args, scope, depth) {
if (this.primitiveFunctions[fun]) {
this.primitiveFunctions[fun](args, scope, depth);
return;
}
}
Otherwise it will compile every argument to the call (which will leave all the resulting values on the stack.)
compileCall(fun, args, scope, depth) {
if (this.primitiveFunctions[fun]) {
this.primitiveFunctions[fun](args, scope, depth);
return;
}
// Compile registers and store on the stack
args.map((arg, i) => this.compileExpression(arg, scope, depth));
}
Then we will check that function is defined and call it.
compileCall(fun, args, scope, depth) {
if (this.primitiveFunctions[fun]) {
this.primitiveFunctions[fun](args, scope, depth);
return;
}
// Compile registers and store on the stack
args.map((arg, i) => this.compileExpression(arg, scope, depth));
const fn = scope.lookup(fun);
if (fn) {
this.emit(depth, `CALL ${fn.name}`);
} else {
throw new Error('Attempt to call undefined function: ' + fun);
}
}
Then we'll reset the stack pointer (to maintain our commitment) based
on the number of arguments and push RAX (where the return
result of the function call will be stored) onto the stack. We'll make
two minor optimizations for when there is exactly zero or one argument
to the function.
compileCall(fun, args, scope, depth) {
if (this.primitiveFunctions[fun]) {
this.primitiveFunctions[fun](args, scope, depth);
return;
}
// Compile registers and store on the stack
args.map((arg, i) => this.compileExpression(arg, scope, depth));
const fn = scope.lookup(fun);
if (fn) {
this.emit(depth, `CALL ${fn.name}`);
} else {
throw new Error('Attempt to call undefined function: ' + fun);
}
if (args.length > 1) {
// Drop the args
this.emit(depth, `ADD RSP, ${args.length * 8}`);
}
if (args.length === 1) {
this.emit(depth, `MOV [RSP], RAX\n`);
} else {
this.emit(depth, 'PUSH RAX\n');
}
}
When there is only one argument, we can just set the top value on the stack to be the return result of the call rather than resetting the stack pointer just to push onto it.
And that's it for compileCall! Now that we've got a feel
for expressions and function calls, let's add some simple arithmetic
operations.
prepareArithmeticWrappers
There are two major kind of arithmetic instructions we'll wrap for now:
- "General" instructions that operate on two arguments, putting the return result in the first argument
- "RAX" instructions that operate on RAX and the first argument,
putting the return result in
RAXand possiblyRDX
prepareGeneral
This helper will compile its two arguments and pop the second argument
into RAX. This is because x86 instructions typically
require one argument to be a register if one argument is allowed to be
a memory address.
We'll use the stack address as the first argument so 1) that non-commutative operations are correct and 2) the result is stored right back onto the stack in the right location.
const prepareGeneral = (instruction) => (arg, scope, depth) => {
depth++;
this.emit(depth, `# ${instruction.toUpperCase()}`);
// Compile first argument
this.compileExpression(arg[0], scope, depth);
// Compile second argument
this.compileExpression(arg[1], scope, depth);
this.emit(depth, `POP RAX`);
// Compile operation
this.emit(depth, `${instruction.toUpperCase()} [RSP], RAX`);
this.emit(depth, `# End ${instruction.toUpperCase()}`);
};
prepareRax
This helper will similarly compile its two arguments and pop
the second argument into RAX. But the RAX-implicit
instructions require the argument to be stored in a register
so we'll use the XCHG instruction to swap RAX
with the value on the top of the stack (the first argument).
const prepareRax = (instruction, outRegister = 'RAX') => (
arg,
scope,
depth,
) => {
depth++;
this.emit(depth, `# ${instruction.toUpperCase()}`);
// Compile first argument, store in RAX
this.compileExpression(arg[0], scope, depth);
// Compile second argument
this.compileExpression(arg[1], scope, depth);
// POP second argument and swap with first
this.emit(depth, `POP RAX`);
this.emit(depth, `XCHG [RSP], RAX`);
This may seem roundabout but remember that we must pop all arguments to the instruction to maintain our commitment.
Next we'll zero out the RDX register if the operation is
DIV, perform the operation, and store the result on the
top of the stack.
const prepareRax = (instruction, outRegister = 'RAX') => (
arg,
scope,
depth,
) => {
depth++;
this.emit(depth, `# ${instruction.toUpperCase()}`);
// Compile first argument, store in RAX
this.compileExpression(arg[0], scope, depth);
// Compile second argument
this.compileExpression(arg[1], scope, depth);
// POP second argument and swap with first
this.emit(depth, `POP RAX`);
this.emit(depth, `XCHG [RSP], RAX`);
// Reset RDX for DIV
if (instruction.toUpperCase() === 'DIV') {
this.emit(depth, `XOR RDX, RDX`);
}
// Compiler operation
this.emit(depth, `${instruction.toUpperCase()} QWORD PTR [RSP]`);
// Swap the top of the stack
this.emit(depth, `MOV [RSP], ${outRegister}`);
};
We parameterize the out register because the % wrapper
will call DIV but need RDX rather than
RAX after the operation.
prepareArithmeticWrappers
Putting everything together we get:
prepareArithmeticWrappers() {
// General operatations
const prepareGeneral = (instruction) => (arg, scope, depth) => {
depth++;
this.emit(depth, `# ${instruction.toUpperCase()}`);
// Compile first argument
this.compileExpression(arg[0], scope, depth);
// Compile second argument
this.compileExpression(arg[1], scope, depth);
this.emit(depth, `POP RAX`);
// Compile operation
this.emit(depth, `${instruction.toUpperCase()} [RSP], RAX`);
this.emit(depth, `# End ${instruction.toUpperCase()}`);
};
// Operations that use RAX implicitly
const prepareRax = (instruction, outRegister = 'RAX') => (
arg,
scope,
depth,
) => {
depth++;
this.emit(depth, `# ${instruction.toUpperCase()}`);
// Compile first argument, store in RAX
this.compileExpression(arg[0], scope, depth);
// Compile second argument
this.compileExpression(arg[1], scope, depth);
// POP second argument and swap with first
this.emit(depth, `POP RAX`);
this.emit(depth, `XCHG [RSP], RAX`);
// Reset RDX for DIV
if (instruction.toUpperCase() === 'DIV') {
this.emit(depth, `XOR RDX, RDX`);
}
// Compiler operation
this.emit(depth, `${instruction.toUpperCase()} QWORD PTR [RSP]`);
// Swap the top of the stack
this.emit(depth, `MOV [RSP], ${outRegister}`);
};
return {
'+': prepareGeneral('add'),
'-': prepareGeneral('sub'),
'&': prepareGeneral('and'),
'|': prepareGeneral('or'),
'=': prepareGeneral('mov'),
'*': prepareRax('mul'),
'/': prepareRax('div'),
'%': prepareRax('div', 'RDX'),
};
}
Next we'll tackle compileBegin and
compileDefine.
compileBegin
A begin form is an expression made up of a series of expressions where all expression values are thrown away and the last expression value is the result of the begin form.
To compile this form we will compile each expression passed in and pop from the stack to throw its value away. If the expression is the last in the list we will not pop since it is the result of the begin form.
We will add one exception to this popping logic: if the begin is called from the top-level we will omit the popping.
compileBegin(body, scope, depth, topLevel = false) {
body.forEach((expression, i) => {
this.compileExpression(expression, scope, depth);
if (!topLevel && i < body.length - 1) {
this.emit(depth, `POP RAX # Ignore non-final expression`);
}
});
}
That's it for compileBegin!
compileDefine
The prelude for a function definition will add its name to scope, push
the current frame pointer (RBP) onto the stack and store
the current stack pointer (RSP) as the new frame pointer
(RBP).
Remember that we use the frame pointer as a point of reference when setting and getting local and parameter values. It works out entirely by convention.
compileDefine([name, params, ...body], scope, depth) {
// Add this function to outer scope
const safe = scope.assign(name);
// Copy outer scope so parameter mappings aren't exposed in outer scope.
const childScope = scope.copy();
this.emit(0, `${safe}:`);
this.emit(depth, `PUSH RBP`);
this.emit(depth, `MOV RBP, RSP\n`);
}
Next we copy the parameters into local scope at their negative (from the frame pointer) location. In the future we may decide to actually copy in the parameter values into the local stack but for now there's no benefit.
compileDefine([name, params, ...body], scope, depth) {
// Add this function to outer scope
const safe = scope.assign(name);
// Copy outer scope so parameter mappings aren't exposed in outer scope.
const childScope = scope.copy();
this.emit(0, `${safe}:`);
this.emit(depth, `PUSH RBP`);
this.emit(depth, `MOV RBP, RSP\n`);
// Copy params into local scope
params.forEach((param, i) => {
childScope.map[param] = -1 * (params.length - i - 1 + 2);
});
}
Next we'll compile the body of the function within a
begin block.
compileDefine([name, params, ...body], scope, depth) {
// Add this function to outer scope
const safe = scope.assign(name);
// Copy outer scope so parameter mappings aren't exposed in outer scope.
const childScope = scope.copy();
this.emit(0, `${safe}:`);
this.emit(depth, `PUSH RBP`);
this.emit(depth, `MOV RBP, RSP\n`);
// Copy params into local scope
params.forEach((param, i) => {
childScope.map[param] = -1 * (params.length - i - 1 + 2);
});
// Pass childScope in for reference when body is compiled.
this.compileBegin(body, childScope, depth);
}
Then in the postlude we'll pop the stack (for the return result of the begin form), save it in RAX, pop the previous frame pointer back to restore the calling frame, and return.
compileDefine([name, params, ...body], scope, depth) {
// Add this function to outer scope
const safe = scope.assign(name);
// Copy outer scope so parameter mappings aren't exposed in outer scope.
const childScope = scope.copy();
this.emit(0, `${safe}:`);
this.emit(depth, `PUSH RBP`);
this.emit(depth, `MOV RBP, RSP\n`);
// Copy params into local scope
params.forEach((param, i) => {
childScope.map[param] = -1 * (params.length - i - 1 + 2);
});
// Pass childScope in for reference when body is compiled.
this.compileBegin(body, childScope, depth);
// Save the return value
this.emit(0, '');
this.emit(depth, `POP RAX`);
this.emit(depth, `POP RBP\n`);
this.emit(depth, 'RET\n');
November 30, 2019
Confusion and disengagement in meetings
The quickest way to cut through confusion or disagreement among otherwise amiable and honest folks is to ask questions.
Ask early so you don't waste time. But it's not enough to just ask clarifying questions because the answers won't always be clear.
Sounds like Human Interaction 101, and maybe it is. These techniques show up more when discussing outcomes and very rarely when discussing assumptions.
Meetings are called to discuss outcomes, not assumptions. But assumptions almost always need to be called into question too.
If you have clarity personally but you observe confusion and disengagement, ask questions and summarize. Someone must be aware of the group and be willing to sound dumb.
If you aren't aware of confusion or disengagement, start paying attention. Addressing doesn't need to be hard and is personally meaningful.
Address confusion and disengagement in meetings by asking questions and summarizing, whether you're confused or not. Question outcomes _and_ assumptions. https://t.co/2OPifEBSq5
— Phil Eaton (@phil_eaton) December 1, 2019
November 18, 2019
Tinybird Changelog: New User Experience for Data Exploration
November 05, 2019
Vitess 4.0 has been released!
October 14, 2019
The cron job that will speed up your Postgres queries 100x
The cron job that will speed up your Postgres queries 100x
October 12, 2019
Interpreting Go
After spending some time at work on tooling for keeping documentation in sync with Go struct definitions I had enough exposure to Go's built-in parsing package that next steps were clear: write an interpreter. It's a great way to get more comfortable with a language's AST.
In this post we'll use the Go parser package to interpret the AST directly (as opposed to compiling to a bytecode VM) with enough to support a recursive implementation of the fibonacci algorithm:
package main
func fib(a int) int {
if a == 1 {
return 0
}
if a == 2 {
return 1
}
return fib(a-1) + fib(a-2)
}
func main() {
println(fib(15))
}
You'll note this isn't actually valid Go because we are using an
undefined function println. We'll provide that for the
runtime to make things easier on ourselves.
The fibonacci algorithm is my goto minimal program that forces us to deal with basic aspects of:
- Function definitions
- Function calls
- Function arguments
- Function return values
- If/else
- Assignment
- Arithmetic and boolean operators
We'll do this in around 200 LoC. Project code is available on Github.
A follow-up post will cover support for an iterative fibonacci implementation with support for basic aspects of:
- Local variables
- Loops
First steps
I always start exploring an AST by practicing error-driven development. It's helpful to have the Go AST, parser, and token package docs handy as well.
We'll focus on single-file programs and start with
parser.ParseFile. This
function will return an
*ast.File. This in turn
contains a list of
Decls. Unfortunately Go stops
being helpful at this point because we have no clue what is going to
implement this Decl interface. So we'll switch on the
concrete type and error until we know what we need to know.
package main
import (
"go/ast"
"go/parser"
"go/token"
"io/ioutil"
"log"
"os"
"reflect"
)
func interpret(f *ast.File) {
for _, decl := range f.Decls {
switch d := decl.(type) {
default:
log.Fatalf("Unknown decl type (%s): %+v", reflect.TypeOf(d), d)
}
}
}
func main() {
fset := token.NewFileSet() // positions are relative to fset
src, err := ioutil.ReadFile(os.Args[1])
if err != nil {
log.Fatalf("Unable to read file: %s", err.Error())
}
f, err := parser.ParseFile(fset, os.Args[1], src, 0)
if err != nil {
log.Fatalf("Unable to parse file: %s", err.Error())
}
interpret(f)
}
Build and run:
$ go build goi.go
$ ./goi fib.go
2019/10/12 09:43:48 Unknown decl type (*ast.FuncDecl): &{Doc:<nil> Recv:<nil> Name:fib Type:0xc000096320 Body:0xc00009a3c0}
Cool! This is the declaration of the fib function and its
type is *ast.FuncDecl.
Interpreting ast.FuncDecl
A function declaration is going to need to add its name to a context
map, mapped to a function reference for use in function calls. Since
Go throws everything into the same context namespace this we can
simply pass around a map of strings to values where a
value can be any Go value. To facilitate this, we'll
define a value struct to hold an integer to represent
"kind" and an empty interface "value". When a value is referenced it
will have to switch on the "kind" and then cast the "value".
Additionally, and unlike a value-oriented language like Scheme, we'll
need to track a set of return values at all stages
through interpretation so, when set, we can short circuit execution.
type kind uint
const (
i64 kind = iota
fn
bl
)
type value struct {
kind kind
value interface{}
}
type context map[string]value
func (c context) copy() context {
cpy := context{}
for key, value := range c {
cpy[key] = value
}
return cpy
}
type ret struct {
set bool
vs []value
}
func (r *ret) setValues(vs []value) {
r.vs = vs
r.set = true
}
func interpretFuncDecl(ctx context, r *ret, fd *ast.FuncDecl) {
ctx[fd.Name.String()] = value{
fn,
func(ctx context, r *ret, args []value) {},
}
}
func interpret(ctx context, f *ast.File) {
for _, decl := range f.Decls {
switch d := decl.(type) {
case *ast.FuncDecl:
interpretFuncDecl(ctx, nil, d)
default:
log.Fatalf("Unknown decl type (%s): %+v", reflect.TypeOf(d), d)
}
}
}
Now that we have the idea of return management and contexts set out, let's fill out the actual function declaration callback. Inside we'll need to copy the context so variables declared inside the function are not visible outside. Then we'll iterate over the parameters and map them in context to the associated argument. Finally we'll interpret the body.
func interpretBlockStmt(ctx context, r *ret, fd *ast.BlockStmt) {}
func interpretFuncDecl(ctx context, r *ret, fd *ast.FuncDecl) {
ctx[fd.Name.String()] = value{
fn,
func(ctx context, r *ret, args []value) {
childCtx := ctx.copy()
for i, param := range fd.Type.Params.List {
childCtx[param.Names[0].String()] = args[i]
}
interpretBlockStmt(childCtx, r, fd.Body)
},
}
}
And we'll add a call to the interpreted main to the end
of the interpreter's main:
func main() {
fset := token.NewFileSet() // positions are relative to fset
src, err := ioutil.ReadFile(os.Args[1])
if err != nil {
log.Fatalf("Unable to read file: %s", err.Error())
}
f, err := parser.ParseFile(fset, os.Args[1], src, 0)
if err != nil {
log.Fatalf("Unable to parse file: %s", err.Error())
}
ctx := context{}
interpret(ctx, f)
var r ret
ctx["main"].value.(func(context, *ret, []value))(ctx, &r, []value{})
}
Next step!
Interpreting ast.BlockStmt
For this AST node, we'll iterate over each statement and interpret it. If the return value has been set we'll execute the loop to short circuit execution.
func interpretStmt(ctx context, r *ret, stmt ast.Stmt) {}
func interpretBlockStmt(ctx context, r *ret, bs *ast.BlockStmt) {
for _, stmt := range bs.List {
interpretStmt(ctx, r, stmt)
if r.set {
return
}
}
}
Next step!
Interpreting ast.Stmt
Since ast.Stmt is another interface, we're back to error-driven development.
func interpretStmt(ctx context, r *ret, stmt ast.Stmt) {
switch s := stmt.(type) {
default:
log.Fatalf("Unknown stmt type (%s): %+v", reflect.TypeOf(s), s)
}
}
And the trigger:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:15:14 Unknown stmt type (*ast.ExprStmt): &{X:0xc0000a02c0}
Great! Checking the docs on
ast.ExprStmt we'll just
skip directly to a call to a new function interpretExpr:
func interpretExpr(ctx context, r *ret, expr ast.Expr) {}
func interpretStmt(ctx context, r *ret, stmt ast.Stmt) {
switch s := stmt.(type) {
case *ast.ExprStmt:
interpretExpr(ctx, r, s.X)
default:
log.Fatalf("Unknown stmt type (%s): %+v", reflect.TypeOf(s), s)
}
}
Moving on!
Interpreting ast.Expr
We've got another interface. Let's error!
func interpretExpr(ctx context, r *ret, expr ast.Expr) {
switch e := expr.(type) {
default:
log.Fatalf("Unknown expr type (%s): %+v", reflect.TypeOf(e), e)
}
}
And the trigger:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:19:16 Unknown expr type (*ast.CallExpr): &{Fun:println Lparen:146 Args:[0xc0000a2280] Ellipsis:0 Rparen:154}
Cool! For a call we'll evaluate the arguments, evaluate the function expression itself, cast the resulting value to a function, and call it.
func interpretCallExpr(ctx context, r *ret, ce *ast.CallExpr) {
var fnr ret
interpretExpr(ctx, &fnr, ce.Fun)
fn := fnr.values[0]
values := []value{}
for _, arg := range ce.Args {
var vr ret
interpretExpr(ctx, &vr, arg)
values = append(values, vr.values[0])
}
fn.value.(func(context, *ret, []value))(ctx, r, values)
}
All of this casting is unsafe because we aren't doing a type-checking stage. But we can ignore this because if a type-checking stage were introduced (which it need to be at some point), it would prevent bad casts from happening.
Handling more ast.Expr implementations
Let's give the interpreter a shot again:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:28:17 Unknown expr type (*ast.Ident): println
We'll need to add ast.Ident
support to interpretCallExpr. This will be a simple
lookup in context. We'll also add a setValue helper since
the ret value is serving double-duty as a value passing
mechanism and also a function's return value (solely where multiple
value are a thing).
...
func (r *ret) setValue(v value) {
r.values = []value{v}
r.set = true
}
...
func interpretExpr(ctx context, r *ret, expr ast.Expr) {
switch e := expr.(type) {
case *ast.CallExpr:
interpretCallExpr(ctx, r, e)
case *ast.Ident:
r.setValue(ctx[e.Name])
default:
log.Fatalf("Unknown expr type (%s): %+v", reflect.TypeOf(e), e)
}
}
This is also a good time to add the println builtin to
our top-level context.
func main() {
...
ctx := context{}
interpret(ctx, f)
ctx["println"] = value{
fn,
func(ctx context, r *ret, args []value) {
var values []interface{}
for _, arg := range args {
values = append(values, arg.value)
}
fmt.Println(values...)
},
}
var r ret
ctx["main"].value.(func(context, *ret, []value))(ctx, &r, []value{})
}
More ast.Expr implementations
Running the interpreter again we get:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:41:59 Unknown expr type (*ast.BasicLit): &{ValuePos:151 Kind:INT Value:15}
Easy enough: we'll switch on the "kind" and parse a string int to an int and wrap it in our value type.
func interpretExpr(ctx context, r *ret, expr ast.Expr) {
switch e := expr.(type) {
case *ast.CallExpr:
interpretCallExpr(ctx, r, e)
case *ast.Ident:
r.setValue(ctx[e.Name])
case *ast.BasicLit:
switch e.Kind {
case token.INT:
i, _ := strconv.ParseInt(e.Value, 10, 64)
r.setValue(value{i64, i})
default:
log.Fatalf("Unknown basiclit type: %+v", e)
}
default:
log.Fatalf("Unknown expr type (%s): %+v", reflect.TypeOf(e), e)
}
}
Now we run again:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:48:46 Unknown stmt type (*ast.IfStmt): &{If:38 Init:<nil> Cond:0xc0000ac150 Body:0xc0000ac1b0 Else:<nil>}
Cool, more control flow!
Interpreting ast.IfStmt
For ast.IfStmt we interpret
the condition and, depending on the condition, interpret the body or
the else node. In order to make empty else interpreting easier, we'll
also add a nil short-circuit to interpretStmt.
func interpretIfStmt(ctx context, r *ret, is *ast.IfStmt) {
interpretStmt(ctx, nil, is.Init)
var cr ret
interpretExpr(ctx, &cr, is.Cond)
c := cr.valus[0]
if c.value.(bool) {
interpretBlockStmt(ctx, r, is.Body)
return
}
interpretStmt(ctx, r, is.Else)
}
func interpretStmt(ctx context, r *ret, stmt ast.Stmt) {
if stmt == nil {
return
}
switch s := stmt.(type) {
case *ast.IfStmt:
interpretIfStmt(ctx, r, s)
...
Let's try it out:
$ go build goi.go
$ ./goi fib.go
2019/10/12 10:56:28 Unknown expr type (*ast.BinaryExpr): &{X:a OpPos:43 Op:== Y:0xc00008a120}
Great!
Interpreting ast.BinaryExpr
An ast.BinaryExpr has an
Op field that we'll switch on to decide what operations
to do. We'll interpret the left side and then the right side and
finally perform the operation and return the result. The three binary
operations we use in this program are ==, +
and -. We'll look these up in go/token
docs to discover the
associated constants.
func interpretBinaryExpr(ctx context, r *ret, bexpr *ast.BinaryExpr) {
var xr, yr ret
interpretExpr(ctx, &xr, bexpr.X)
x := xr.values[0]
interpretExpr(ctx, &yr, bexpr.Y)
y := yr.values[0]
switch bexpr.Op {
case token.ADD:
r.setValue(value{i64, x.value.(int64) + y.value.(int64)})
case token.SUB:
r.setValue(value{i64, x.value.(int64) - y.value.(int64)})
case token.EQL:
r.setValue(value{bl, x.value.(int64) == y.value.(int64)})
default:
log.Fatalf("Unknown binary expression type: %+v", bexpr)
}
}
func interpretExpr(ctx context, r *ret, expr ast.Expr) {
switch e := expr.(type) {
case *ast.BinaryExpr:
interpretBinaryExpr(ctx, r, e)
...
Let's try one more time!
$ go build goi.go
$ ./goi fib.go
2019/10/12 11:06:19 Unknown stmt type (*ast.ReturnStmt): &{Return:94 Results:[0xc000070540]}
Awesome, last step.
Interpreting ast.ReturnStmt
Based on the
ast.ReturnStmt definition
we'll have to interpret each expression and set all of them to the
ret value.
func interpretReturnStmt(ctx context, r *ret, s *ast.ReturnStmt) {
var values []value
for _, expr := range s.Results {
var r ret
interpretExpr(ctx, &r, expr)
values = append(values, r.values[0])
}
r.setValues(values)
return
}
func interpretStmt(ctx context, r *ret, stmt ast.Stmt) {
if stmt == nil {
return
}
switch s := stmt.(type) {
case *ast.ReturnStmt:
interpretReturnStmt(ctx, r, s)
...
And let's try one last time:
$ go build goi.go
$ ./goi fib.go
377
Looking good. :) Let's try with another input:
$ cat fib.go
package main
func fib(a int) int {
if a == 1 {
return 0
}
if a == 2 {
return 1
}
return fib(a-1) + fib(a-2)
}
func main() {
println(fib(14))
}
$ ./goi fib.go
233
We've got the basics of an interpreter for Golang.
Here's a blog post on building a simple AST interpreter for Go to support running a recursive fibonacci implementation https://t.co/5Zz388d8ZN
— Phil Eaton (@phil_eaton) October 12, 2019