feat(dashboard): hand-rolled Rhai parser + symbol table + Vitest

Foundation for upcoming editor features (scope-aware autocomplete,
goto-def / find-usages, source formatter). Hand-rolled recursive
descent in TypeScript with Pratt precedence climbing for expressions,
error-tolerant so partial trees stay usable while the user is typing.
Symbol table walks the AST to produce per-scope declarations, usage
sites, and object-literal field maps. Vitest added as a dev-only
runner; no editor wiring in this commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-23 23:38:15 +02:00
parent a80e6d1ca4
commit bc8b512b56
11 changed files with 2361 additions and 3 deletions

View File

@@ -0,0 +1,248 @@
// Tokenizer for the dashboard's Rhai parser.
//
// Produces a flat array of tokens (eager — Rhai scripts in the dashboard
// are small, 20-200 lines typical) plus a separate list of comments. The
// parser only sees tokens; comments are handed to the formatter so it
// can re-emit them at the right positions.
//
// Keyword and operator lists trace back to the upstream TextMate grammar
// (rhaiscript/vscode-rhai). We don't copy any grammar bytes.
import type { Comment, Range } from './ast';
/**
 * Lexical category attached to every token the tokenizer emits.
 * `EOF` is the zero-width marker appended after the last real token.
 */
export type TokenKind = 'Ident' | 'Keyword' | 'Number' | 'String' | 'Punct' | 'Operator' | 'EOF';
/**
 * A single lexical token. Positional fields are inherited from `Range`;
 * the tokenizer fills in `start` and `end` for every token it emits.
 */
export interface Token extends Range {
  /** Lexical category of this token. */
  kind: TokenKind;
  /**
   * Source text of the token. Ident/Keyword/Punct/Operator tokens carry the
   * literal source text; Number/String tokens carry the full literal,
   * quotes included.
   */
  text: string;
}
/**
 * Words the tokenizer reports as `Keyword` rather than `Ident`.
 * Grouped by role for readability; membership and insertion order are
 * what the lexer and any consumer of this set observe.
 */
export const KEYWORDS = new Set<string>([
  // Declarations
  'let', 'const', 'fn',
  // Branching and loops
  'if', 'else', 'while', 'loop', 'do', 'for', 'in',
  // Jumps
  'return', 'break', 'continue',
  // Switch
  'switch', 'case', 'default',
  // Literals
  'true', 'false', 'null',
  // Error handling
  'try', 'catch', 'throw',
  // Miscellaneous
  'as', 'is', 'private'
]);
// Operators longer than one character. The lexer takes the first entry that
// matches at the current offset, so three-character operators are listed
// ahead of the two-character operators they start with.
const MULTI_CHAR_OPS: readonly string[] = [
  // Three characters
  '??=', '..=',
  // Null-coalescing, range, path
  '??', '..', '::',
  // Comparison
  '==', '!=', '<=', '>=',
  // Logical
  '&&', '||',
  // Shifts
  '<<', '>>',
  // Compound assignment
  '+=', '-=', '*=', '/=', '%=',
  // Arrows
  '=>', '->'
];
// One-character operators; each character of the string becomes a set member.
const SINGLE_CHAR_OPS = new Set<string>('+-*/%<>!&|^~=?');
// Punctuation characters; each character of the string becomes a set member.
// `#` is listed so the opening of a `#{` object-map literal is recognized:
// the lexer emits `#` as its own `Punct` token and the parser pairs it with
// the `{` that follows.
const PUNCTS = new Set<string>('(){}[];,.:#');
/** Everything `tokenize` produces for one source text. */
export interface LexResult {
  /** Tokens in source order; the final entry is the `EOF` token. */
  tokens: Array<Token>;
  /** Line and block comments in source order, kept apart from the tokens. */
  comments: Array<Comment>;
}
/**
 * Tokenizes Rhai source text for the dashboard parser.
 *
 * Tokens and comments are collected eagerly into two separate arrays. Every
 * entry carries `start`/`end` offsets into `source` (`end` is exclusive) and
 * the matching source slice as `text`. The token array always ends with a
 * zero-width `EOF` token positioned at `source.length`.
 *
 * The lexer never throws: unterminated strings and block comments run to the
 * end of input (never past it), and characters that match no rule are
 * skipped silently.
 *
 * Character literals (`'a'`, `'\n'`) are emitted with kind `'String'` because
 * `TokenKind` has no dedicated character kind; their `text` includes the
 * surrounding single quotes.
 *
 * @param source Rhai script text.
 * @returns The token stream (terminated by `EOF`) and the comments found.
 */
export function tokenize(source: string): LexResult {
  const tokens: Token[] = [];
  const comments: Comment[] = [];
  const n = source.length;
  let i = 0;

  while (i < n) {
    const ch = source[i];
    const start = i;

    // Whitespace
    if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
      i++;
      continue;
    }

    // Line comment: runs up to, but not including, the newline.
    if (ch === '/' && source[i + 1] === '/') {
      while (i < n && source[i] !== '\n') i++;
      comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Block comment (supports nesting per the Rhai book)
    if (ch === '/' && source[i + 1] === '*') {
      i = scanBlockComment(source, start);
      comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Strings: " ... " (escape-aware) and ` ... ` (no escape processing,
    // multi-line). The whole literal, quotes included, becomes one token.
    if (ch === '"' || ch === '`') {
      i = scanString(source, start);
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Character literal. Without this rule the quotes would be skipped and
    // the payload misread (`'"'` would even open a runaway string literal).
    if (ch === "'") {
      const end = scanCharLiteral(source, start);
      if (end !== -1) {
        i = end;
        tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
        continue;
      }
      // Not a complete literal: fall through so the quote is skipped below.
    }

    // Numbers: hex, octal, binary, decimal with optional fraction/exponent.
    if (isDigit(ch)) {
      i = scanNumber(source, start);
      tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Identifier or keyword
    if (isIdentStart(ch)) {
      i++;
      while (i < n && isIdentCont(source[i])) i++;
      const text = source.slice(start, i);
      tokens.push({ kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident', start, end: i, text });
      continue;
    }

    // Multi-char operators are tried before single-char ones so that `==`
    // is not split into two `=` tokens.
    const multi = matchMultiCharOp(source, start);
    if (multi !== undefined) {
      i += multi.length;
      tokens.push({ kind: 'Operator', start, end: i, text: multi });
      continue;
    }

    // Single-char operator
    if (SINGLE_CHAR_OPS.has(ch)) {
      i++;
      tokens.push({ kind: 'Operator', start, end: i, text: ch });
      continue;
    }

    // Punctuation
    if (PUNCTS.has(ch)) {
      i++;
      tokens.push({ kind: 'Punct', start, end: i, text: ch });
      continue;
    }

    // Unrecognized: skip and let the parser report the gap if needed.
    i++;
  }

  tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
  return { tokens, comments };
}

/** Returns the offset just past the (possibly nested) block comment opening at `start`. */
function scanBlockComment(source: string, start: number): number {
  const n = source.length;
  let i = start + 2;
  let depth = 1;
  while (i < n && depth > 0) {
    if (source[i] === '/' && source[i + 1] === '*') {
      depth++;
      i += 2;
    } else if (source[i] === '*' && source[i + 1] === '/') {
      depth--;
      i += 2;
    } else {
      i++;
    }
  }
  return i;
}

/** Returns the offset just past the string literal whose opening quote is at `start`. */
function scanString(source: string, start: number): number {
  const n = source.length;
  const quote = source[start];
  let i = start + 1;
  while (i < n) {
    const c = source[i];
    if (c === '\\' && quote === '"') {
      // Skip the escaped character, but never step past the end of input
      // when the source stops right after the backslash.
      i = Math.min(i + 2, n);
      continue;
    }
    i++;
    if (c === quote) break;
  }
  return i;
}

/**
 * Returns the offset just past a complete character literal starting at
 * `start`, or -1 when the text there is not one (so the caller can fall back
 * to skipping the quote).
 */
function scanCharLiteral(source: string, start: number): number {
  const n = source.length;
  let i = start + 1;
  if (i >= n) return -1;

  const first = source[i];
  if (first === "'" || first === '\n' || first === '\r') return -1;

  if (first === '\\') {
    if (i + 1 >= n) return -1;
    const selector = source[i + 1];
    if (selector === '\n' || selector === '\r') return -1;
    i += 2;
    // `\x`, `\u` and `\U` are followed by a hex payload of at most 8 digits.
    if (selector === 'x' || selector === 'u' || selector === 'U') {
      const limit = Math.min(n, i + 8);
      while (i < limit && isHexDigit(source[i])) i++;
    }
  } else {
    // A character outside the BMP occupies two UTF-16 code units.
    const codePoint = source.codePointAt(i);
    i += codePoint !== undefined && codePoint > 0xffff ? 2 : 1;
  }

  return i < n && source[i] === "'" ? i + 1 : -1;
}

/** Returns the offset just past the numeric literal starting at `start`. */
function scanNumber(source: string, start: number): number {
  const n = source.length;
  let i = start;

  // Radix-prefixed integers: 0x (hex), 0o (octal), 0b (binary).
  const marker = source[start] === '0' ? source[start + 1] : undefined;
  let isRadixDigit: ((c: string) => boolean) | undefined;
  if (marker === 'x' || marker === 'X') isRadixDigit = isHexDigit;
  else if (marker === 'o' || marker === 'O') isRadixDigit = isOctalDigit;
  else if (marker === 'b' || marker === 'B') isRadixDigit = isBinaryDigit;
  if (isRadixDigit !== undefined) {
    i += 2;
    while (i < n && (isRadixDigit(source[i]) || source[i] === '_')) i++;
    return i;
  }

  // Decimal integer part; underscores act as digit separators.
  while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;

  // Fraction only when a digit follows the dot, so `0..10` and `1.foo()`
  // keep their `.` tokens.
  if (source[i] === '.' && i + 1 < n && isDigit(source[i + 1])) {
    i++;
    while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
  }

  // Exponent only when at least one digit follows the marker (and optional
  // sign); otherwise the `e` belongs to whatever comes next.
  if (source[i] === 'e' || source[i] === 'E') {
    let j = i + 1;
    if (source[j] === '+' || source[j] === '-') j++;
    if (j < n && isDigit(source[j])) {
      i = j;
      while (i < n && isDigit(source[i])) i++;
    }
  }
  return i;
}

/** Returns the first multi-character operator found at `offset`, if any. */
function matchMultiCharOp(source: string, offset: number): string | undefined {
  for (const op of MULTI_CHAR_OPS) {
    if (source.startsWith(op, offset)) return op;
  }
  return undefined;
}

/** True when `c` is an octal digit (`0`-`7`). */
function isOctalDigit(c: string): boolean {
  return c >= '0' && c <= '7';
}

/** True when `c` is a binary digit (`0` or `1`). */
function isBinaryDigit(c: string): boolean {
  return c === '0' || c === '1';
}
/** True when `c` is an ASCII decimal digit (`0`-`9`). */
function isDigit(c: string): boolean {
  return '0' <= c && c <= '9';
}
/** True when `c` is a hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`). */
function isHexDigit(c: string): boolean {
  if (isDigit(c)) return true;
  const lowerHex = 'a' <= c && c <= 'f';
  const upperHex = 'A' <= c && c <= 'F';
  return lowerHex || upperHex;
}
/** True when `c` may begin an identifier: an ASCII letter or underscore. */
function isIdentStart(c: string): boolean {
  if (c === '_') return true;
  const lower = 'a' <= c && c <= 'z';
  const upper = 'A' <= c && c <= 'Z';
  return lower || upper;
}
/** True when `c` may continue an identifier: a digit or any identifier-start character. */
function isIdentCont(c: string): boolean {
  return isDigit(c) || isIdentStart(c);
}