Week: 10
Points: 20
Time: ~5 hours
Deliverable: toolchain/compiler/tokenizer.py + toolchain/compiler/parser.py + parse tree output + diary/week-10.md
What you ship
toolchain/compiler/tokenizer.pytoolchain/compiler/parser.pycompiler/week-10/Square.vl— provided test source (see below)compiler/week-10/Square.xml— your parse tree outputcompiler/week-10/Square.xml.ref— reference parse tree (provided)diary/week-10.md
Lab 10.1: Write the tokenizer
Write toolchain/compiler/tokenizer.py. The tokenizer strips comments and yields tokens one at a time:
from enum import Enum, auto
from dataclasses import dataclass
class TokenType(Enum):
KEYWORD = auto()
SYMBOL = auto()
INTEGER = auto()
STRING = auto()
IDENTIFIER = auto()
KEYWORDS = {
'class', 'constructor', 'function', 'method', 'field', 'static',
'var', 'int', 'char', 'boolean', 'void', 'true', 'false', 'null',
'this', 'let', 'do', 'if', 'else', 'while', 'return',
}
SYMBOLS = set('{}()[].,;+-*/&|<>=~')
@dataclass
class Token:
type: TokenType
value: str
class Tokenizer:
def __init__(self, source: str):
self._text = self._strip_comments(source)
self._pos = 0
self._tokens: list[Token] = []
self._index = 0
self._tokenize()
def _strip_comments(self, src: str) -> str:
import re
src = re.sub(r'/\*.*?\*/', '', src, flags=re.DOTALL)
src = re.sub(r'//[^\n]*', '', src)
return src
def _tokenize(self):
text = self._text
i = 0
while i < len(text):
if text[i].isspace():
i += 1
elif text[i] in SYMBOLS:
self._tokens.append(Token(TokenType.SYMBOL, text[i]))
i += 1
elif text[i] == '"':
j = text.index('"', i + 1)
self._tokens.append(Token(TokenType.STRING, text[i+1:j]))
i = j + 1
elif text[i].isdigit():
j = i
while j < len(text) and text[j].isdigit():
j += 1
self._tokens.append(Token(TokenType.INTEGER, text[i:j]))
i = j
elif text[i].isalpha() or text[i] == '_':
j = i
while j < len(text) and (text[j].isalnum() or text[j] == '_'):
j += 1
word = text[i:j]
tt = TokenType.KEYWORD if word in KEYWORDS else TokenType.IDENTIFIER
self._tokens.append(Token(tt, word))
i = j
else:
i += 1
def has_more(self) -> bool: return self._index < len(self._tokens)
def peek(self) -> Token: return self._tokens[self._index]
def advance(self) -> Token:
t = self._tokens[self._index]; self._index += 1; return t
def expect(self, value: str) -> Token:
t = self.advance()
if t.value != value:
raise SyntaxError(f"Expected '{value}', got '{t.value}'")
return t
Verify by tokenizing Square.vl and printing each token type and value. The tokenizer must handle:
- Multi-line
/* */comments - Single-line
//comments - String literals (no escape sequences required)
- Integer constants 0-32767
- All VirtusLang keywords and symbols
Lab 10.2: Write the recursive-descent parser
Write toolchain/compiler/parser.py. The parser consumes the token stream and produces a parse tree of ParseNode objects:
from dataclasses import dataclass, field as dc_field
from typing import Optional
from tokenizer import Tokenizer, TokenType, Token
@dataclass
class ParseNode:
tag: str
children: list = dc_field(default_factory=list)
value: Optional[str] = None
def to_xml(self, indent=0) -> str:
pad = ' ' * indent
if self.value is not None:
return f"{pad}<{self.tag}> {self.value} </{self.tag}>"
lines = [f"{pad}<{self.tag}>"]
for child in self.children:
lines.append(child.to_xml(indent + 1))
lines.append(f"{pad}</{self.tag}>")
return '\n'.join(lines)
The parser implements one method per grammar rule. Key rules:
class Parser:
def __init__(self, tok: Tokenizer):
self.tok = tok
def _expect(self, value: str) -> ParseNode:
t = self.tok.expect(value)
return ParseNode(t.type.name.lower(), value=t.value)
def _advance_node(self) -> ParseNode:
t = self.tok.advance()
return ParseNode(t.type.name.lower(), value=t.value)
def parse_class(self) -> ParseNode:
node = ParseNode('class')
node.children.append(self._expect('class'))
node.children.append(self._advance_node()) # class name
node.children.append(self._expect('{'))
while self.tok.peek().value in ('static', 'field'):
node.children.append(self.parse_class_var_dec())
while self.tok.peek().value in ('constructor', 'function', 'method'):
node.children.append(self.parse_subroutine_dec())
node.children.append(self._expect('}'))
return node
def parse_class_var_dec(self) -> ParseNode:
node = ParseNode('classVarDec')
node.children.append(self._advance_node()) # static | field
node.children.append(self._advance_node()) # type
node.children.append(self._advance_node()) # varName
while self.tok.peek().value == ',':
node.children.append(self._expect(','))
node.children.append(self._advance_node())
node.children.append(self._expect(';'))
return node
def parse_subroutine_dec(self) -> ParseNode:
node = ParseNode('subroutineDec')
node.children.append(self._advance_node()) # constructor|function|method
node.children.append(self._advance_node()) # void | type
node.children.append(self._advance_node()) # subroutine name
node.children.append(self._expect('('))
node.children.append(self.parse_parameter_list())
node.children.append(self._expect(')'))
node.children.append(self.parse_subroutine_body())
return node
def parse_parameter_list(self) -> ParseNode:
node = ParseNode('parameterList')
if self.tok.peek().value != ')':
node.children.append(self._advance_node()) # type
node.children.append(self._advance_node()) # name
while self.tok.peek().value == ',':
node.children.append(self._expect(','))
node.children.append(self._advance_node())
node.children.append(self._advance_node())
return node
def parse_subroutine_body(self) -> ParseNode:
node = ParseNode('subroutineBody')
node.children.append(self._expect('{'))
while self.tok.peek().value == 'var':
node.children.append(self.parse_var_dec())
node.children.append(self.parse_statements())
node.children.append(self._expect('}'))
return node
def parse_var_dec(self) -> ParseNode:
node = ParseNode('varDec')
node.children.append(self._expect('var'))
node.children.append(self._advance_node()) # type
node.children.append(self._advance_node()) # varName
while self.tok.peek().value == ',':
node.children.append(self._expect(','))
node.children.append(self._advance_node())
node.children.append(self._expect(';'))
return node
def parse_statements(self) -> ParseNode:
node = ParseNode('statements')
stmt_kw = {'let', 'if', 'while', 'do', 'return'}
while self.tok.has_more() and self.tok.peek().value in stmt_kw:
kw = self.tok.peek().value
if kw == 'let': node.children.append(self.parse_let())
elif kw == 'if': node.children.append(self.parse_if())
elif kw == 'while': node.children.append(self.parse_while())
elif kw == 'do': node.children.append(self.parse_do())
elif kw == 'return': node.children.append(self.parse_return())
return node
def parse_let(self) -> ParseNode:
node = ParseNode('letStatement')
node.children.append(self._expect('let'))
node.children.append(self._advance_node()) # varName
if self.tok.peek().value == '[':
node.children.append(self._expect('['))
node.children.append(self.parse_expression())
node.children.append(self._expect(']'))
node.children.append(self._expect('='))
node.children.append(self.parse_expression())
node.children.append(self._expect(';'))
return node
def parse_if(self) -> ParseNode:
node = ParseNode('ifStatement')
node.children.append(self._expect('if'))
node.children.append(self._expect('('))
node.children.append(self.parse_expression())
node.children.append(self._expect(')'))
node.children.append(self._expect('{'))
node.children.append(self.parse_statements())
node.children.append(self._expect('}'))
if self.tok.has_more() and self.tok.peek().value == 'else':
node.children.append(self._expect('else'))
node.children.append(self._expect('{'))
node.children.append(self.parse_statements())
node.children.append(self._expect('}'))
return node
def parse_while(self) -> ParseNode:
node = ParseNode('whileStatement')
node.children.append(self._expect('while'))
node.children.append(self._expect('('))
node.children.append(self.parse_expression())
node.children.append(self._expect(')'))
node.children.append(self._expect('{'))
node.children.append(self.parse_statements())
node.children.append(self._expect('}'))
return node
def parse_do(self) -> ParseNode:
node = ParseNode('doStatement')
node.children.append(self._expect('do'))
node.children.append(self._advance_node()) # subroutineName or className
if self.tok.peek().value == '.':
node.children.append(self._expect('.'))
node.children.append(self._advance_node())
node.children.append(self._expect('('))
node.children.append(self.parse_expression_list())
node.children.append(self._expect(')'))
node.children.append(self._expect(';'))
return node
def parse_return(self) -> ParseNode:
node = ParseNode('returnStatement')
node.children.append(self._expect('return'))
if self.tok.peek().value != ';':
node.children.append(self.parse_expression())
node.children.append(self._expect(';'))
return node
def parse_expression(self) -> ParseNode:
node = ParseNode('expression')
node.children.append(self.parse_term())
ops = set('+-*/&|<>=')
while self.tok.has_more() and self.tok.peek().value in ops:
node.children.append(self._advance_node())
node.children.append(self.parse_term())
return node
def parse_term(self) -> ParseNode:
node = ParseNode('term')
tok = self.tok.peek()
if tok.type == TokenType.INTEGER:
node.children.append(self._advance_node())
elif tok.type == TokenType.STRING:
node.children.append(self._advance_node())
elif tok.value in ('true', 'false', 'null', 'this'):
node.children.append(self._advance_node())
elif tok.value == '(':
node.children.append(self._expect('('))
node.children.append(self.parse_expression())
node.children.append(self._expect(')'))
elif tok.value in ('-', '~'):
node.children.append(self._advance_node())
node.children.append(self.parse_term())
else:
# identifier: varName, varName[expr], subroutineCall
node.children.append(self._advance_node())
if self.tok.has_more() and self.tok.peek().value == '[':
node.children.append(self._expect('['))
node.children.append(self.parse_expression())
node.children.append(self._expect(']'))
elif self.tok.has_more() and self.tok.peek().value in ('.', '('):
if self.tok.peek().value == '.':
node.children.append(self._expect('.'))
node.children.append(self._advance_node())
node.children.append(self._expect('('))
node.children.append(self.parse_expression_list())
node.children.append(self._expect(')'))
return node
def parse_expression_list(self) -> ParseNode:
node = ParseNode('expressionList')
if self.tok.peek().value != ')':
node.children.append(self.parse_expression())
while self.tok.peek().value == ',':
node.children.append(self._expect(','))
node.children.append(self.parse_expression())
return node
Lab 10.3: Parse Square.vl and compare against the reference
Create compiler/week-10/Square.vl:
class Square {
field int x, y;
field int size;
constructor Square new(int Ax, int Ay, int Asize) {
let x = Ax;
let y = Ay;
let size = Asize;
do draw();
return this;
}
method void dispose() {
do Memory.deAlloc(this);
return;
}
method void draw() {
do Screen.setColor(true);
do Screen.drawRectangle(x, y, x + size, y + size);
return;
}
method void moveUp() {
if (y > 1) {
do Screen.setColor(false);
do Screen.drawRectangle(x, (y + size) - 1, x + size, y + size);
let y = y - 2;
do Screen.setColor(true);
do Screen.drawRectangle(x, y, x + size, y + 1);
}
return;
}
}
Run the parser:
python3 -c "
from tokenizer import Tokenizer
from parser import Parser
src = open('compiler/week-10/Square.vl').read()
tree = Parser(Tokenizer(src)).parse_class()
print(tree.to_xml())
" > compiler/week-10/Square.xml
Compare against the reference XML (provided in worksheets/csa-110/lab10_square_ref.xml):
diff compiler/week-10/Square.xml worksheets/csa-110/lab10_square_ref.xml
Your output must match the reference exactly — no extra whitespace, no missing nodes. If diff shows differences, trace through the grammar rule that produced the mismatch.
Lab 10.4: Seeded error drill — syntax error recovery
Introduce a deliberate syntax error in Square.vl: remove the closing } of the draw method. Run the parser and verify it raises a SyntaxError with a useful message (not an unhelpful Python stack trace).
Add error handling to your parser that reports the line number (estimated from token position) when possible.
Lab 10.5: Toolchain Diary — CSA-102 comparison
In diary/week-10.md, write two paragraphs:
Paragraph 1: Compare your VirtusLang tokenizer to the Py6502v tokenizer from CSA-102. Identify one structural difference between the two (there is at least one). Explain why the difference exists given the language properties each tokenizer handles.
Paragraph 2: The recursive-descent parser has one method per grammar rule. The grammar for expression and term is mutually recursive (expression calls parse_term, parse_term can call parse_expression). Would a table-driven (LR) parser handle this mutual recursion differently? State whether mutual recursion is a feature or a limitation of the recursive-descent approach.
Grading
| Component | Points |
|---|---|
tokenizer.py: handles all token types, strips both comment forms |
6 |
parser.py: produces well-formed parse tree for Square.vl |
10 |
Square.xml diff against reference: zero differences |
2 |
| Toolchain Diary: CSA-102 comparison + mutual-recursion analysis | 2 |