#!/usr/bin/python3 # -*- coding: utf-8 -*- """Run meta5ix abstract machine assembly. Invoke as: ./meta5ixrun.py meta5ix.generated.m5asm meta5ix.m5 This is part of the meta5ix project; meta5ix is the following parser generator written in itself, based on Val Schorre’s META II and some more recent inspirational work at VPRI: - program: ["-" name @it ":" terms {return}, "#" [:notin ""]] - terms: term ["," {continue $choice} term] @choice - term: (factor {else $seq}, output) [factor {assert}, output] @seq - factor: string {literal $it} , "(" terms ")" , "[" @many terms {continue $many} "]" , name {call $it} , "<<" {begin} terms ">>" {end} , ":fnord" {fnord} factor , ":notin" string {notin $it} , ":between" string {between $it} - output: (quasiquote, "@" {dedent} (var, quasiquote)) {writeline} - quasiquote: "{" [(<>, "\" <<:notin "">>) {say "$it"}, "$" var] "}" - ch: :notin "$}\" - var: "it" {copyinput}, name {gen $it} - string: :fnord <<'"' [:notin '"'] '"', "'" [:notin "'"] "'">> - name: :fnord <> - letter: :between "az", :between "AZ", :between "__" The above compiler compiles itself to the virtual machine instruction set interpreted by this program, which has only 17 orders, or instructions. The output is an assembly program that is 224 instructions long; it begins as follows: program many12 literal "-" else seq15 call name assert dedent copyinput writeline literal ":" As with META II, unindented lines are labels, and indented lines are instructions (hereafter “orders”), but order opcodes are separated from their argument by whitespace, not vertical alignment. Execution begins at the beginning of the program. The 17 orders are divided into four groups: subroutine control, logic, input, and output. In addition to its input and output streams, the machine has: - a program counter, initially at the beginning of the program; - a call counter, initially 0; - a success flag, initially set; - a stack, initially empty; - a token mark, initially undefined; - a last consumed token, initially undefined; and - an output buffer, which is a variable-length string that can be appended to, holding a partial line of output. The subroutine control orders use the stack but do not affect the success flag: - `call foo` pushes the program counter (the address of the following instruction) onto the stack, followed by the call counter. Then it increments the call counter and jumps to `foo`, which is to say that it sets the program counter to the address of the label `foo`. (The concrete implementation below packages both of these words into a single stack-frame object to simplify the implementation.) - `return` terminates the program successfully if the stack was empty; otherwise, it discards the saved call counter and pops the program counter off the stack. The logic orders alter control flow and the success flag according to the success flag: - `assert` halts with an error message if the success flag is clear. - `else foo` jumps to `foo` if the success flag is clear. The success flag remains unchanged. - `continue foo` jumps to `foo` if the success flag is *set*; otherwise, it sets the success flag. There are two input orders for setting the last consumed token: - `begin` sets a token-start mark in the input stream, cabbaging the last consumed token. - `end` sets the last consumed token to be all the text consumed from the input stream since the token-start mark set by `begin`, if the success flag is set, or backs up the input to the token-start mark, if the success flag is clear. It does not alter the success flag. There is a special input order for consuming whitespace: - `fnord` consumes as much whitespace, including line endings, as possible, but neither changes nor consults the success flag. There are three more input orders for consuming input. They set the success flag if they consume input, clearing it otherwise: - `notin "4x*"` consumes a character of input as long as that character is neither the line separator, '4', 'x', nor '*', and analogously for other argument strings. So `notin ""` consumes any character as long as it is not a newline. - `between "gn"` consumes a character of input as long as it is 'g', 'h', 'i', 'j', 'k', 'l', 'm', or 'n', and analogously for other two-character argument strings. - `literal "foo"` consumes whitespace in the same way as `fnord`, then the literal text `foo`. The output orders produce output as follows: - `writeline` outputs the output buffer, then sets it to ` `, a string consisting of 8 spaces. - `dedent` sets the output buffer to the empty string. - `say "foo"` emits `foo`, which is to say, it appends `foo` to the output buffer. - `gen foo` emits `foo` and the decimal representation of the top stack item — the saved call counter. - `copyinput` emits the last consumed token. This is an error if the last token failed and so the last consumed token is cabbaged. Retrospective ------------- It's been two years since I wrote Meta5ix, and although I'm still pretty pleased with it, I think there are some things I would change if I wrote it now: - I would use *(stuff) for repetition instead of [stuff], which misleadingly implies zero-or-one. - I would use | for alternation instead of ``,``, which misleadingly implies sequencing. - I would call :fnord :spacing. Similarly for fnord. - I would call "rand" "operand". - Maybe :between should be :matchrange. - I would call $it $tok or $copy, and of course the same for @it. copyinput also needs renaming. - I would handle string quoting differently: instead of two different choices of quote characters but no escaping, which means that strings containing both of the quotes can't be represented at all, I would have one kind of quotes, with escaping inside of them. Possible approaches to escaping include backslashing followed by a literal character (already used in quasiquote), ~ followed by a letter, or quote doubling. See meta5ix2c2.m5 for details on the problem. This requires a new primitive order that removes a character from the token being built; it's no longer strictly a substring of the input line. The ~q ~t approach would also require the ability to insert a character into the token. Alternatively maybe Meta5ix itself could fob off the responsibility for removing backslashes onto whatever the backend is? - I think "literal", "else", "call", "return", "writeline", "say", and "dedent" are good names. "assert", "gen", "begin", "end", "between", and "copyinput" could be clearer. "continue" is misleading; it should be something like "ifgood". - The m5asm would be clearer with colons on its labels. - The absence of an assembly-language implementation is pretty sad. It would be pretty cool to implement it using the macro system of gas or nasm, which probably would benefit from renaming "call" and maybe "else", and also from colons on labels. """ from __future__ import print_function import sys, pprint, string def assemble(lines): "Parse input assembly file into a (labels, insns) pair." labels = {} # A dict mapping label names to order indices. insns = [] # Orders (instructions). for line in lines: # Comments. if line.startswith(';'): continue # Instructions are indented. if line[:1] in ' \t\n': if line.strip(): insns.append(line.split(None, 1)) # Labels aren’t indented. else: labels[line.strip()] = len(insns) return labels, insns class ParseFailure(Exception): pass fresh_line = ' ' * 8 class ParsingMachine: def run(self, labels, insns, data): "Run the program in insns on the data in data using labels to jump." self.pc = 0 # Program counter; address of next instruction. self.labels = labels # Mapping from label names to PC values. self.insns = insns # Array of assembly instructions, indexed by PC. self.input = data # A potentially large string of input data. self.inpos = 0 # The position to which we have consumed it. self.stack = [(None, None, '(top)')] # (returnpc, callcount, name) self.success = True # Flag telling whether parse is currently failing. self.calls = 0 # Counter of all calls so far; never decreases. self.outbuf = fresh_line # Output buffer. # Returning from the top level sets pc to None while self.pc is not None: insn = self.insns[self.pc] debug('pc', self.pc, insn, self.inpos, repr(self.peek(5))) self.pc += 1 self.execute(insn[0], insn[1].strip() if len(insn) > 1 else None) def execute(self, op, rand): "Execute a single order, with operand rand." ### Subroutine orders. if op == 'call': self.stack.append((self.pc, self.calls, rand)) self.calls += 1 self.pc = self.labels[rand] elif op == 'return': self.pc, _, _ = self.stack.pop() ### Logic orders. elif op == 'else': if not self.success: self.pc = self.labels[rand] elif op == 'continue': if self.success: self.pc = self.labels[rand] self.success = True elif op == 'assert': if not self.success: raise ParseFailure(self.inpos, self.peek(10), self.backtrace()) ### Input orders. elif op == 'literal': self.fnord() s = rand[1:-1] # strip quotes self.consume_if(self.peek(len(s)) == s, n=len(s)) elif op == 'begin': self.token_start = self.inpos elif op == 'end': n = self.inpos - self.token_start self.inpos = self.token_start self.token = self.consume_if(self.success, n) del self.token_start elif op == 'fnord': self.fnord() elif op == 'notin': c = self.peek(1) self.consume_if(c and c != '\n' and c not in rand[1:-1]) elif op == 'between': c = self.peek(1) # rand is something like '"AZ"' self.consume_if(c and c != '\n' and rand[1] <= c <= rand[2]) ### Output orders. elif op == 'writeline': debug('writeline', repr(self.outbuf)) print(self.outbuf) self.outbuf = fresh_line elif op == 'dedent': self.outbuf = '' elif op == 'copyinput': self.outbuf += self.token elif op == 'say': self.outbuf += rand[1:-1] elif op == 'gen': self.outbuf += '%s%s' % (rand, self.stack[-1][1]) else: raise ValueError(op) def consume_if(self, go, n=1): "If go: succeed, consume n characters, and return them; else, fail." self.success = go if go: self.inpos += n return self.input[self.inpos-n:self.inpos] def peek(self, n): "Examine the next n characters without consuming them." return self.input[self.inpos:self.inpos+n] def fnord(self): "Skip over whitespace in input." while self.peek(1) and self.peek(1) in ' \t\n': self.inpos += 1 def backtrace(self): "Generate a backtrace so you can tell why a parse failed." # XXX this is subtracting the entry address from the wrong stack level return [("%s+%s" % (label, pc - entry), raw, self.insns[pc-2:pc+1]) for raw, pc, label, entry in [(pc, 0 if pc is None else pc, label, self.labels.get(label, 0)) for pc, _, label in self.stack]] def main(argv): with open(argv[2]) as inputfile: data = inputfile.read() with open(argv[1]) as m5asmfile: labels, insns = assemble(m5asmfile) # pprint.pprint(labels) # pprint.pprint(list(enumerate(insns))) ParsingMachine().run(labels, insns, data) debug = lambda *args: None if __name__ == '__main__': if sys.argv[1] == '-v': sys.argv.pop(1) debug = lambda *args: print(';', *args) main(sys.argv)