Files
PiCloud/dashboard/src/lib/rhai/lexer.ts
MechaCat02 bc8b512b56 feat(dashboard): hand-rolled Rhai parser + symbol table + Vitest
Foundation for upcoming editor features (scope-aware autocomplete,
goto-def / find-usages, source formatter). Hand-rolled recursive
descent in TypeScript with Pratt precedence climbing for expressions,
error-tolerant so partial trees stay usable while the user is typing.
Symbol table walks the AST to produce per-scope declarations, usage
sites, and object-literal field maps. Vitest added as a dev-only
runner; no editor wiring in this commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 23:38:15 +02:00

249 lines
5.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Tokenizer for the dashboard's Rhai parser.
//
// Produces a flat array of tokens (eager — Rhai scripts in the dashboard
// are small, 20-200 lines typical) plus a separate list of comments. The
// parser only sees tokens; comments are handed to the formatter so it
// can re-emit them at the right positions.
//
// Keyword and operator lists trace back to the upstream TextMate grammar
// (rhaiscript/vscode-rhai). We don't copy any grammar bytes.
import type { Comment, Range } from './ast';
/**
 * Lexical category attached to every token the tokenizer emits.
 *
 * Grouped below as: names, literals, symbols, and the end-of-input sentinel.
 */
export type TokenKind =
  | 'Ident' | 'Keyword'     // names: plain identifiers vs. reserved words
  | 'Number' | 'String'     // literals
  | 'Punct' | 'Operator'    // symbols
  | 'EOF';                  // sentinel appended after the last real token
/**
 * A single lexical token. Position information is inherited from `Range`
 * (declared in `./ast`).
 */
export interface Token extends Range {
  /**
   * Source text of the token. For Ident/Keyword/Punct/Operator this is the
   * literal source text; for Number/String it is the full literal,
   * including quotes.
   */
  text: string;
  /** Lexical category of this token. */
  kind: TokenKind;
}
/**
 * Words the tokenizer classifies as `Keyword` instead of `Ident`.
 * Grouped by role for readability; insertion order is unchanged.
 */
export const KEYWORDS = new Set([
  // declarations
  'let', 'const', 'fn',
  // branching and loops
  'if', 'else', 'while', 'loop', 'do', 'for', 'in',
  // jumps
  'return', 'break', 'continue',
  // switch
  'switch', 'case', 'default',
  // literal words
  'true', 'false', 'null',
  // error handling
  'try', 'catch', 'throw',
  // miscellaneous
  'as', 'is', 'private'
]);
// Multi-char operators, longest first so the lexer picks them up greedily:
// the tokenizer takes the first entry that matches at the current offset, so
// every operator must be listed before any operator that is a prefix of it
// (`**=` before `**`, `<<=` before `<<`, `??=` before `??`, ...).
//
// Includes Rhai's power operator (`**`) and the full set of compound
// assignments (`**=`, `<<=`, `>>=`, `&=`, `|=`, `^=`) so they are emitted as
// one token instead of being split into unrelated single-char operators.
const MULTI_CHAR_OPS = [
  // three characters
  '??=',
  '..=',
  '**=',
  '<<=',
  '>>=',
  // two characters
  '??',
  '..',
  '::',
  '==',
  '!=',
  '<=',
  '>=',
  '&&',
  '||',
  '<<',
  '>>',
  '**',
  '+=',
  '-=',
  '*=',
  '/=',
  '%=',
  '&=',
  '|=',
  '^=',
  '=>',
  '->'
];
// Operators made of exactly one character. Iterating the string yields one
// set entry per character, in the order written.
const SINGLE_CHAR_OPS = new Set('+-*/%<>!&|^~=?');
// Single-character punctuation, one set entry per character of the string.
// `#` is part of the set so the start of a `#{` object-map literal is
// recognized: the lexer emits it as its own `Punct` token and the parser
// joins it with the `{` that follows.
const PUNCTS = new Set('(){}[];,.:#');
/** Everything `tokenize` produces for one source string. */
export interface LexResult {
  /** Line and block comments, collected separately from the token stream. */
  comments: Comment[];
  /** Tokens in source order; the last entry is always the `EOF` token. */
  tokens: Token[];
}
/**
 * Tokenize a Rhai source string in a single left-to-right pass.
 *
 * The lexer is error-tolerant and never throws: unterminated strings and
 * block comments simply run to the end of the input, and characters that do
 * not start any known token are skipped.
 *
 * All `start` / `end` values are indices into `source` (`end` exclusive) and
 * never exceed `source.length`.
 *
 * @param source The script text to scan.
 * @returns The tokens in source order, terminated by a zero-width `EOF`
 *   token at `source.length`, plus the comments encountered along the way
 *   (comments never appear in `tokens`).
 */
export function tokenize(source: string): LexResult {
  const tokens: Token[] = [];
  const comments: Comment[] = [];
  let i = 0;
  const n = source.length;
  while (i < n) {
    const ch = source[i];
    // Whitespace
    if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
      i++;
      continue;
    }
    // Line comment: runs up to (not including) the next newline.
    if (ch === '/' && source[i + 1] === '/') {
      const start = i;
      while (i < n && source[i] !== '\n') i++;
      comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
      continue;
    }
    // Block comment (supports nesting per the Rhai book). An unterminated
    // comment extends to the end of the input.
    if (ch === '/' && source[i + 1] === '*') {
      const start = i;
      i += 2;
      let depth = 1;
      while (i < n && depth > 0) {
        if (source[i] === '/' && source[i + 1] === '*') {
          depth++;
          i += 2;
        } else if (source[i] === '*' && source[i + 1] === '/') {
          depth--;
          i += 2;
        } else {
          i++;
        }
      }
      comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
      continue;
    }
    // Character literals: ' ... ' (escape-aware, confined to one line).
    // `TokenKind` has no dedicated character kind, so they are emitted as
    // `String` tokens whose text includes the quotes. Lexing them as a unit
    // keeps the payload from being misread as code: without this, `'a'`
    // would yield an identifier `a`, and `'"'` would open a runaway string.
    if (ch === "'") {
      const start = i;
      i++;
      while (i < n && source[i] !== "'" && source[i] !== '\n') {
        // Step over an escaped character (e.g. `\'`) as a pair, but never
        // over a newline or past the end of the input.
        if (source[i] === '\\' && i + 1 < n && source[i + 1] !== '\n') {
          i += 2;
        } else {
          i++;
        }
      }
      // Consume the closing quote when present; an unterminated literal
      // stops at the end of the line instead.
      if (source[i] === "'") i++;
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }
    // Strings: " ... " (escape-aware, single-line by convention) and
    // ` ... ` (raw, multi-line). We tokenize the entire literal including
    // quotes; the parser only cares about its position and text.
    if (ch === '"' || ch === '`') {
      const quote = ch;
      const start = i;
      i++;
      while (i < n) {
        const c = source[i];
        if (c === '\\' && quote === '"') {
          // Skip the backslash and the escaped character. Clamp to `n` so a
          // backslash that is the very last character of the input cannot
          // push the token's `end` past `source.length`.
          i = Math.min(i + 2, n);
          continue;
        }
        if (c === quote) {
          i++;
          break;
        }
        i++;
      }
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }
    // Numbers: hex, binary, decimal, optional `.frac`, optional exponent.
    // Underscores are allowed as digit separators per Rhai.
    if (isDigit(ch)) {
      const start = i;
      if (ch === '0' && (source[i + 1] === 'x' || source[i + 1] === 'X')) {
        i += 2;
        while (i < n && (isHexDigit(source[i]) || source[i] === '_')) i++;
      } else if (ch === '0' && (source[i + 1] === 'b' || source[i + 1] === 'B')) {
        i += 2;
        while (i < n && (source[i] === '0' || source[i] === '1' || source[i] === '_')) i++;
      } else {
        while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        // Only treat `.` as a fraction when a digit follows, so `0..10`
        // (range) and `1.foo()` (method call) keep their `.` tokens.
        if (source[i] === '.' && isDigit(source[i + 1])) {
          i++;
          while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        }
        if (source[i] === 'e' || source[i] === 'E') {
          i++;
          if (source[i] === '+' || source[i] === '-') i++;
          while (i < n && isDigit(source[i])) i++;
        }
      }
      tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
      continue;
    }
    // Identifier or keyword
    if (isIdentStart(ch)) {
      const start = i;
      i++;
      while (i < n && isIdentCont(source[i])) i++;
      const text = source.slice(start, i);
      tokens.push({
        kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident',
        start,
        end: i,
        text
      });
      continue;
    }
    // Multi-char operators: first match wins, so the list order decides
    // which operator is picked when one is a prefix of another.
    let matched = false;
    for (const op of MULTI_CHAR_OPS) {
      if (source.startsWith(op, i)) {
        tokens.push({ kind: 'Operator', start: i, end: i + op.length, text: op });
        i += op.length;
        matched = true;
        break;
      }
    }
    if (matched) continue;
    // Single-char operator
    if (SINGLE_CHAR_OPS.has(ch)) {
      tokens.push({ kind: 'Operator', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }
    // Punctuation
    if (PUNCTS.has(ch)) {
      tokens.push({ kind: 'Punct', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }
    // Unrecognized: skip and let the parser report the gap if needed.
    i++;
  }
  tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
  return { tokens, comments };
}
/** True when `c` is an ASCII decimal digit, `0` through `9`. */
function isDigit(c: string): boolean {
  const atLeastZero = '0' <= c;
  return atLeastZero && c <= '9';
}
/** True when `c` is a hexadecimal digit: `0`-`9`, `a`-`f` or `A`-`F`. */
function isHexDigit(c: string): boolean {
  if (isDigit(c)) return true;
  if ('a' <= c && c <= 'f') return true;
  return 'A' <= c && c <= 'F';
}
/** True when `c` may begin an identifier: an ASCII letter or `_`. */
function isIdentStart(c: string): boolean {
  if (c === '_') return true;
  const lower = 'a' <= c && c <= 'z';
  const upper = 'A' <= c && c <= 'Z';
  return lower || upper;
}
/** True when `c` may continue an identifier: a digit, an ASCII letter or `_`. */
function isIdentCont(c: string): boolean {
  return isDigit(c) || isIdentStart(c);
}