Foundation for upcoming editor features (scope-aware autocomplete, goto-def / find-usages, source formatter). Hand-rolled recursive descent in TypeScript with Pratt precedence climbing for expressions, error-tolerant so partial trees stay usable while the user is typing. Symbol table walks the AST to produce per-scope declarations, usage sites, and object-literal field maps. Vitest added as a dev-only runner; no editor wiring in this commit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
249 lines
5.7 KiB
TypeScript
249 lines
5.7 KiB
TypeScript
// Tokenizer for the dashboard's Rhai parser.
//
// Produces a flat array of tokens (eager — Rhai scripts in the dashboard
// are small, 20–200 lines typical) plus a separate list of comments. The
// parser only sees tokens; comments are handed to the formatter so it
// can re-emit them at the right positions.
//
// Keyword and operator lists trace back to the upstream TextMate grammar
// (rhaiscript/vscode-rhai). We don't copy any grammar bytes.
|
||
|
||
import type { Comment, Range } from './ast';
|
||
|
||
/**
 * Lexical category attached to every token. `EOF` is the zero-width
 * sentinel that closes each token stream.
 */
export type TokenKind =
  | 'Ident' | 'Keyword'
  | 'Number' | 'String'
  | 'Punct' | 'Operator'
  | 'EOF';
|
||
|
||
/** A single lexeme together with the source range inherited from `Range`. */
export interface Token extends Range {
  /** Lexical category of this token. */
  kind: TokenKind;
  /**
   * Source text of the token. Ident/Keyword/Punct/Operator carry the
   * literal source text; Number/String carry the full literal, quotes
   * included.
   */
  text: string;
}
|
||
|
||
/**
 * Reserved words. An identifier-shaped lexeme found in this set is emitted
 * as a `Keyword` token instead of an `Ident`.
 */
export const KEYWORDS = new Set<string>([
  // Declarations
  'let', 'const', 'fn',
  // Branching and loops
  'if', 'else', 'while', 'loop', 'do', 'for', 'in',
  // Jumps
  'return', 'break', 'continue',
  // Switch
  'switch', 'case', 'default',
  // Literal words
  'true', 'false', 'null',
  // Error handling
  'try', 'catch', 'throw',
  // Miscellaneous
  'as', 'is', 'private'
]);
|
||
|
||
// Operators longer than one character. The lexer tries them in array order
// and takes the first match, so every three-character operator is listed
// before the two-character operator it starts with (`??=` before `??`,
// `..=` before `..`).
const MULTI_CHAR_OPS = [
  '??=', '..=',
  '??', '..', '::',
  '==', '!=', '<=', '>=',
  '&&', '||',
  '<<', '>>',
  '+=', '-=', '*=', '/=', '%=',
  '=>', '->'
];
|
||
|
||
// One-character operators; each character of the string becomes one member.
const SINGLE_CHAR_OPS = new Set([...'+-*/%<>!&|^~=?']);
|
||
|
||
// Punctuation characters; each character of the string becomes one member.
// `#` is listed so the opening of a `#{` object-map literal is recognized:
// the lexer emits `#` as its own `Punct` and the parser joins it with the
// `{` that follows.
const PUNCTS = new Set([...'(){}[];,.:#']);
|
||
|
||
/** Everything `tokenize` produces for one source string. */
export interface LexResult {
  /** Tokens in source order, terminated by an `EOF` token. */
  tokens: Token[];
  /** Line and block comments in source order; these never appear in `tokens`. */
  comments: Comment[];
}
|
||
|
||
/**
 * Tokenizes Rhai source text in a single eager pass.
 *
 * Whitespace is dropped. Comments are collected into `comments` and never
 * appear in `tokens`. Characters that match no rule are skipped silently so
 * a half-typed script still yields a usable token stream. Unterminated
 * strings and block comments run to the end of input instead of throwing.
 *
 * @param source Raw script text.
 * @returns The token list, always terminated by a zero-width `EOF` token at
 *   `source.length`, plus the comment list. Every `start`/`end` is an offset
 *   into `source` with `end` exclusive, and never exceeds `source.length`.
 */
export function tokenize(source: string): LexResult {
  const tokens: Token[] = [];
  const comments: Comment[] = [];
  const n = source.length;
  let i = 0;

  while (i < n) {
    const ch = source[i];

    // Whitespace
    if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
      i++;
      continue;
    }

    // Line comment: runs up to, but not including, the newline.
    if (ch === '/' && source[i + 1] === '/') {
      const start = i;
      while (i < n && source[i] !== '\n') i++;
      comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Block comment (supports nesting per the Rhai book). An unterminated
    // comment simply extends to the end of input.
    if (ch === '/' && source[i + 1] === '*') {
      const start = i;
      i += 2;
      let depth = 1;
      while (i < n && depth > 0) {
        if (source[i] === '/' && source[i + 1] === '*') {
          depth++;
          i += 2;
        } else if (source[i] === '*' && source[i + 1] === '/') {
          depth--;
          i += 2;
        } else {
          i++;
        }
      }
      comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Strings: " ... " (escape-aware, single-line by convention) and
    // ` ... ` (raw, multi-line). We tokenize the entire literal including
    // quotes; the parser only cares about its position and text.
    if (ch === '"' || ch === '`') {
      const quote = ch;
      const start = i;
      i++;
      while (i < n) {
        const c = source[i];
        if (c === '\\' && quote === '"') {
          // Skip the backslash and the escaped character. Clamp so that a
          // lone trailing backslash cannot push `end` past the source length.
          i = Math.min(i + 2, n);
          continue;
        }
        if (c === quote) {
          i++;
          break;
        }
        i++;
      }
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Numbers: hex (0x), octal (0o), binary (0b), decimal with optional
    // `.frac` and optional exponent. Underscores are allowed as digit
    // separators per Rhai.
    if (isDigit(ch)) {
      const start = i;
      // A radix prefix is only meaningful directly after a leading `0`.
      const radix = ch === '0' ? source[i + 1] : undefined;
      if (radix === 'x' || radix === 'X') {
        i += 2;
        while (i < n && (isHexDigit(source[i]) || source[i] === '_')) i++;
      } else if (radix === 'o' || radix === 'O') {
        i += 2;
        while (i < n && ((source[i] >= '0' && source[i] <= '7') || source[i] === '_')) i++;
      } else if (radix === 'b' || radix === 'B') {
        i += 2;
        while (i < n && (source[i] === '0' || source[i] === '1' || source[i] === '_')) i++;
      } else {
        while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        // Require a digit after the dot so `1..5` and `1.foo()` keep the
        // dot as a separate token.
        if (source[i] === '.' && isDigit(source[i + 1])) {
          i++;
          while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
        }
        if (source[i] === 'e' || source[i] === 'E') {
          i++;
          if (source[i] === '+' || source[i] === '-') i++;
          while (i < n && isDigit(source[i])) i++;
        }
      }
      tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Identifier or keyword
    if (isIdentStart(ch)) {
      const start = i;
      i++;
      while (i < n && isIdentCont(source[i])) i++;
      const text = source.slice(start, i);
      tokens.push({
        kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident',
        start,
        end: i,
        text
      });
      continue;
    }

    // Multi-char operators: first match in list order wins.
    let matched = false;
    for (const op of MULTI_CHAR_OPS) {
      if (source.startsWith(op, i)) {
        tokens.push({ kind: 'Operator', start: i, end: i + op.length, text: op });
        i += op.length;
        matched = true;
        break;
      }
    }
    if (matched) continue;

    // Single-char operator
    if (SINGLE_CHAR_OPS.has(ch)) {
      tokens.push({ kind: 'Operator', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Punctuation
    if (PUNCTS.has(ch)) {
      tokens.push({ kind: 'Punct', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Unrecognized: skip and let the parser report the gap if needed.
    i++;
  }

  tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
  return { tokens, comments };
}
|
||
|
||
/** True when `c` falls in the ASCII range `0`–`9`. */
function isDigit(c: string): boolean {
  return '0' <= c && c <= '9';
}
|
||
|
||
/** True when `c` is a decimal digit or a letter `a`–`f` in either case. */
function isHexDigit(c: string): boolean {
  if (isDigit(c)) return true;
  if ('a' <= c && c <= 'f') return true;
  return 'A' <= c && c <= 'F';
}
|
||
|
||
/** True when `c` may begin an identifier: an ASCII letter or underscore. */
function isIdentStart(c: string): boolean {
  if (c === '_') return true;
  if ('a' <= c && c <= 'z') return true;
  return 'A' <= c && c <= 'Z';
}
|
||
|
||
/** True when `c` may continue an identifier: an identifier-start character or a digit. */
function isIdentCont(c: string): boolean {
  if (isIdentStart(c)) return true;
  return isDigit(c);
}
|