feat(dashboard): hand-rolled Rhai parser + symbol table + Vitest
Foundation for upcoming editor features (scope-aware autocomplete, goto-def / find-usages, source formatter). Hand-rolled recursive descent in TypeScript with Pratt precedence climbing for expressions, error-tolerant so partial trees stay usable while the user is typing. Symbol table walks the AST to produce per-scope declarations, usage sites, and object-literal field maps. Vitest added as a dev-only runner; no editor wiring in this commit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
248
dashboard/src/lib/rhai/lexer.ts
Normal file
248
dashboard/src/lib/rhai/lexer.ts
Normal file
@@ -0,0 +1,248 @@
|
||||
// Tokenizer for the dashboard's Rhai parser.
|
||||
//
|
||||
// Produces a flat array of tokens (eager — Rhai scripts in the dashboard
|
||||
// are small, 20–200 lines typical) plus a separate list of comments. The
|
||||
// parser only sees tokens; comments are handed to the formatter so it
|
||||
// can re-emit them at the right positions.
|
||||
//
|
||||
// Keyword and operator lists trace back to the upstream TextMate grammar
|
||||
// (rhaiscript/vscode-rhai). We don't copy any grammar bytes.
|
||||
|
||||
import type { Comment, Range } from './ast';
|
||||
|
||||
/**
 * Lexical category attached to every token the tokenizer emits.
 * `EOF` is the sentinel appended once at the very end of the token list.
 */
export type TokenKind = 'Ident' | 'Keyword' | 'Number' | 'String' | 'Punct' | 'Operator' | 'EOF';
|
||||
|
||||
/**
 * One lexical token. Positional fields are inherited from `Range`
 * (declared in `./ast`).
 */
export interface Token extends Range {
  /** Lexical category of this token. */
  kind: TokenKind;
  /**
   * Ident / Keyword / Punct / Operator: the literal source text.
   * Number / String: the full literal, including quotes.
   */
  text: string;
}
|
||||
|
||||
/**
 * Words the lexer classifies as `Keyword` tokens instead of `Ident`.
 * Insertion order is kept stable for anything that iterates the set.
 */
export const KEYWORDS = new Set<string>([
  // declarations
  'let', 'const', 'fn',
  // branching and loops
  'if', 'else', 'while', 'loop', 'do', 'for', 'in',
  // jumps
  'return', 'break', 'continue',
  // switch
  'switch', 'case', 'default',
  // literals
  'true', 'false', 'null',
  // error handling
  'try', 'catch', 'throw',
  // miscellaneous
  'as', 'is', 'private'
]);
|
||||
|
||||
// Operators made of more than one character. The three-character entries
// precede the two-character ones, so the first entry that matches at a given
// position is also the longest one.
// NOTE(review): Rhai also documents `**`, `**=`, `<<=`, `>>=`, `&=`, `|=` and
// `^=`; they are not listed here — confirm whether the parser needs them.
const MULTI_CHAR_OPS = [
  // three characters
  '??=', '..=',
  // two characters
  '??', '..', '::',
  '==', '!=', '<=', '>=',
  '&&', '||', '<<', '>>',
  '+=', '-=', '*=', '/=', '%=',
  '=>', '->'
];
|
||||
|
||||
// Characters emitted as a one-character `Operator` token when no
// multi-character operator matches at the same position.
const SINGLE_CHAR_OPS = new Set<string>([
  '+', '-', '*', '/', '%',
  '<', '>', '!',
  '&', '|', '^', '~',
  '=', '?'
]);
|
||||
|
||||
// Characters emitted as one-character `Punct` tokens. `#` is part of the set
// so the opening of a `#{` object-map literal can be recognized: the lexer
// emits `#` on its own and the parser joins it with the `{` that follows.
const PUNCTS = new Set<string>([
  '(', ')',
  '{', '}',
  '[', ']',
  ';', ',', '.', ':',
  '#'
]);
|
||||
|
||||
/** Everything `tokenize` produces for one source string. */
export interface LexResult {
  /** Tokens in source order; the last entry is always the `EOF` token. */
  tokens: Token[];
  /** Line and block comments in source order, kept apart from the tokens. */
  comments: Comment[];
}
|
||||
|
||||
/**
 * Splits Rhai source text into tokens and comments.
 *
 * The scan is error-tolerant and never throws: an unterminated string or
 * block comment simply extends to the end of the input, and a character that
 * fits no category is skipped without producing a token.
 *
 * @param source - Rhai source text.
 * @returns The tokens in source order, always terminated by a zero-width
 *   `EOF` token positioned at `source.length`, plus the comments that were
 *   encountered (comments never appear in `tokens`).
 */
export function tokenize(source: string): LexResult {
  const tokens: Token[] = [];
  const comments: Comment[] = [];
  let i = 0;
  const n = source.length;

  while (i < n) {
    const ch = source[i];

    // Whitespace
    if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
      i++;
      continue;
    }

    // Line comment: runs up to, but not including, the newline.
    if (ch === '/' && source[i + 1] === '/') {
      const start = i;
      while (i < n && source[i] !== '\n') i++;
      comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Block comment (supports nesting per the Rhai book). An unterminated
    // comment runs to the end of the input.
    if (ch === '/' && source[i + 1] === '*') {
      const start = i;
      i += 2;
      let depth = 1;
      while (i < n && depth > 0) {
        if (source[i] === '/' && source[i + 1] === '*') {
          depth++;
          i += 2;
        } else if (source[i] === '*' && source[i + 1] === '/') {
          depth--;
          i += 2;
        } else {
          i++;
        }
      }
      comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Quoted literals, all emitted as `String` tokens whose text is the full
    // literal including quotes:
    //   " ... "  escape-aware string (single-line by convention)
    //   ` ... `  raw, multi-line string (backslash is not an escape)
    //   ' ... '  escape-aware character literal
    // Recognizing `'` matters even though the parser only needs position and
    // text: otherwise the content of 'a' would surface as an identifier, and
    // '"' would open a string that swallows the rest of the script.
    if (ch === '"' || ch === '`' || ch === "'") {
      const quote = ch;
      const start = i;
      i++;
      while (i < n) {
        const c = source[i];
        if (c === '\\' && quote !== '`') {
          // Skip the escaped character, but never step past the end of the
          // input: a literal cut off right after a backslash must still end
          // at `source.length`.
          i = Math.min(i + 2, n);
          continue;
        }
        if (c === quote) {
          i++;
          break;
        }
        // A character literal never spans lines; stopping at the newline
        // keeps a stray apostrophe from consuming everything after it.
        if (quote === "'" && c === '\n') break;
        i++;
      }
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Numbers: hex (0x), octal (0o), binary (0b), or decimal with optional
    // `.frac` and optional exponent. Underscores are allowed as digit
    // separators per Rhai.
    if (isDigit(ch)) {
      const start = i;
      const radixMarker = ch === '0' ? source[i + 1] : undefined;
      if (radixMarker === 'x' || radixMarker === 'X') {
        i += 2;
        while (i < n && (isHexDigit(source[i]) || source[i] === '_')) i++;
      } else if (radixMarker === 'o' || radixMarker === 'O') {
        i += 2;
        while (i < n && ((source[i] >= '0' && source[i] <= '7') || source[i] === '_')) i++;
      } else if (radixMarker === 'b' || radixMarker === 'B') {
        i += 2;
        while (i < n && (source[i] === '0' || source[i] === '1' || source[i] === '_')) i++;
      } else {
        while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        // Require a digit after the dot so `1..5` (range) and `1.foo()`
        // (method call) leave the dot for the operator/punct rules.
        if (source[i] === '.' && isDigit(source[i + 1])) {
          i++;
          while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        }
        if (source[i] === 'e' || source[i] === 'E') {
          i++;
          if (source[i] === '+' || source[i] === '-') i++;
          while (i < n && isDigit(source[i])) i++;
        }
      }
      tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Identifier or keyword
    if (isIdentStart(ch)) {
      const start = i;
      i++;
      while (i < n && isIdentCont(source[i])) i++;
      const text = source.slice(start, i);
      tokens.push({
        kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident',
        start,
        end: i,
        text
      });
      continue;
    }

    // Multi-char operators: the list is ordered longest first, so the first
    // hit is the greedy match.
    let matched = false;
    for (const op of MULTI_CHAR_OPS) {
      if (source.startsWith(op, i)) {
        tokens.push({ kind: 'Operator', start: i, end: i + op.length, text: op });
        i += op.length;
        matched = true;
        break;
      }
    }
    if (matched) continue;

    // Single-char operator
    if (SINGLE_CHAR_OPS.has(ch)) {
      tokens.push({ kind: 'Operator', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Punctuation
    if (PUNCTS.has(ch)) {
      tokens.push({ kind: 'Punct', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Unrecognized: skip and let the parser report the gap if needed.
    i++;
  }

  tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
  return { tokens, comments };
}
|
||||
|
||||
/** Reports whether `c` is an ASCII decimal digit, `0` through `9`. */
function isDigit(c: string): boolean {
  if (c >= '0') {
    return c <= '9';
  }
  return false;
}
|
||||
|
||||
/** Reports whether `c` is an ASCII hexadecimal digit: `0-9`, `a-f` or `A-F`. */
function isHexDigit(c: string): boolean {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
|
||||
|
||||
/** Reports whether `c` may begin an identifier: an ASCII letter or `_`. */
function isIdentStart(c: string): boolean {
  if (c === '_') return true;
  if (c >= 'a' && c <= 'z') return true;
  return c >= 'A' && c <= 'Z';
}
|
||||
|
||||
/**
 * Reports whether `c` may appear after the first character of an identifier:
 * anything accepted by `isIdentStart`, or a decimal digit.
 */
function isIdentCont(c: string): boolean {
  if (isIdentStart(c)) {
    return true;
  }
  return isDigit(c);
}
|
||||
Reference in New Issue
Block a user