// Tokenizer for the dashboard's Rhai parser.
//
// The lexer is eager: it returns one flat token array (Rhai scripts in the
// dashboard are small, typically 20–200 lines) together with a separate
// list of comments. Only tokens reach the parser; comments are handed to
// the formatter so it can re-emit them at the right positions.
//
// The keyword and operator lists trace back to the upstream TextMate
// grammar (rhaiscript/vscode-rhai). No grammar bytes are copied.

import type { Comment, Range } from './ast';

/** Lexical category attached to every token the lexer emits. */
export type TokenKind =
  | 'Ident'
  | 'Keyword'
  | 'Number'
  | 'String'
  | 'Punct'
  | 'Operator'
  | 'EOF';

/** A single lexeme together with the source range it was read from. */
export interface Token extends Range {
  kind: TokenKind;
  /**
   * Source text of the token. For `Ident`/`Keyword`/`Punct`/`Operator` this
   * is the literal spelling; for `Number`/`String` it is the complete
   * literal, quotes included.
   */
  text: string;
}

/** Words that are lexed as `Keyword` rather than `Ident`. */
export const KEYWORDS = new Set([
  // declarations
  'let', 'const', 'fn',
  // branching and loops
  'if', 'else', 'while', 'loop', 'do', 'for', 'in',
  // jumps
  'return', 'break', 'continue',
  // switch
  'switch', 'case', 'default',
  // literals
  'true', 'false', 'null',
  // error handling
  'try', 'catch', 'throw',
  // miscellaneous
  'as', 'is', 'private'
]);

// Operators spanning more than one character. The lexer takes the first
// entry that matches at the cursor, so longer spellings must stay ahead of
// their own prefixes (`??=` before `??`, `..=` before `..`).
const MULTI_CHAR_OPS = [
  // three characters
  '??=', '..=',
  // two characters
  '??', '..', '::',
  '==', '!=', '<=', '>=',
  '&&', '||', '<<', '>>',
  '+=', '-=', '*=', '/=', '%=',
  '=>', '->'
];

// One-character operators. Constructing a Set from a string adds one entry
// per character, in order: + - * / % < > ! & | ^ ~ = ?
const SINGLE_CHAR_OPS = new Set('+-*/%<>!&|^~=?');

// `#` is included so we can recognize the start of `#{` object-map literals;
// the lexer emits it as a separate `Punct` and the parser combines it with
// the following `{`.
const PUNCTS = new Set(['(', ')', '{', '}', '[', ']', ';', ',', '.', ':', '#']);

/** Result of {@link tokenize}: parser-facing tokens plus the comments kept aside. */
export interface LexResult {
  /** Tokens in source order; the final entry is always the `EOF` token. */
  tokens: Token[];
  /** Line and block comments in source order; they never appear in `tokens`. */
  comments: Comment[];
}

/**
 * Splits Rhai source text into tokens and comments.
 *
 * The lexer is lenient and never throws: characters it does not recognize
 * are skipped, and an unterminated string or block comment simply runs to
 * the end of the input. Every `start`/`end` is an index into `source`
 * (`end` exclusive) and never exceeds `source.length`.
 *
 * @param source Complete script text.
 * @returns The token list, terminated by a zero-width `EOF` token at
 *   `source.length`, and the list of comments.
 */
export function tokenize(source: string): LexResult {
  const tokens: Token[] = [];
  const comments: Comment[] = [];
  const n = source.length;
  let i = 0;

  while (i < n) {
    // `charAt` yields '' past the end, so lookahead needs no bounds checks.
    const ch = source.charAt(i);
    const next = source.charAt(i + 1);

    // Whitespace
    if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
      i++;
      continue;
    }

    // Line comment: runs up to, but not including, the newline.
    if (ch === '/' && next === '/') {
      const start = i;
      while (i < n && source.charAt(i) !== '\n') i++;
      comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Block comment (supports nesting per the Rhai book)
    if (ch === '/' && next === '*') {
      const start = i;
      i = scanBlockComment(source, start);
      comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Strings: " ... " (escape-aware) and ` ... ` (raw, multi-line). The
    // whole literal, quotes included, becomes one token; the parser only
    // cares about its position and text.
    if (ch === '"' || ch === '`') {
      const start = i;
      i = scanString(source, start);
      tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Character literal: ' ... ' on a single line. It is emitted as a
    // `String` token (text includes the quotes) so that a literal such as
    // '"' cannot be mistaken for the start of a double-quoted string. A
    // quote with no closing partner on the same line falls through and is
    // skipped like any other unrecognized character.
    if (ch === "'") {
      const end = scanCharLiteral(source, i);
      if (end !== -1) {
        tokens.push({ kind: 'String', start: i, end, text: source.slice(i, end) });
        i = end;
        continue;
      }
    }

    // Numbers: hex, octal, binary, decimal with optional `.frac` and
    // exponent. Underscores are allowed as digit separators per Rhai.
    if (isDigit(ch)) {
      const start = i;
      i = scanNumber(source, start);
      tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
      continue;
    }

    // Identifier or keyword
    if (isIdentStart(ch)) {
      const start = i;
      i++;
      while (i < n && isIdentCont(source.charAt(i))) i++;
      const text = source.slice(start, i);
      tokens.push({ kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident', start, end: i, text });
      continue;
    }

    // Multi-char operators: the list is ordered longest first, so the first
    // hit is the greedy match.
    const op = MULTI_CHAR_OPS.find((candidate) => source.startsWith(candidate, i));
    if (op !== undefined) {
      tokens.push({ kind: 'Operator', start: i, end: i + op.length, text: op });
      i += op.length;
      continue;
    }

    // Single-char operator
    if (SINGLE_CHAR_OPS.has(ch)) {
      tokens.push({ kind: 'Operator', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Punctuation
    if (PUNCTS.has(ch)) {
      tokens.push({ kind: 'Punct', start: i, end: i + 1, text: ch });
      i++;
      continue;
    }

    // Unrecognized: skip and let the parser report the gap if needed.
    i++;
  }

  tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
  return { tokens, comments };
}

/** Returns the index just past the (possibly nested) block comment opening at `start`. */
function scanBlockComment(source: string, start: number): number {
  const n = source.length;
  let i = start + 2; // step over the opening "/*"
  let depth = 1;
  while (i < n && depth > 0) {
    if (source.startsWith('/*', i)) {
      depth++;
      i += 2;
    } else if (source.startsWith('*/', i)) {
      depth--;
      i += 2;
    } else {
      i++;
    }
  }
  return i;
}

/** Returns the index just past the string literal whose opening quote is at `start`. */
function scanString(source: string, start: number): number {
  const n = source.length;
  const quote = source.charAt(start);
  let i = start + 1;
  while (i < n) {
    const c = source.charAt(i);
    if (c === '\\' && quote === '"') {
      // Skip the escaped character, but never step beyond the input when
      // the backslash is the very last character.
      i = Math.min(i + 2, n);
      continue;
    }
    i++;
    if (c === quote) break;
  }
  return i;
}

/**
 * Returns the index just past the character literal opening at `start`, or
 * -1 when no closing quote is found before the end of the line.
 */
function scanCharLiteral(source: string, start: number): number {
  const n = source.length;
  let i = start + 1;
  while (i < n) {
    const c = source.charAt(i);
    if (c === '\n' || c === '\r') return -1;
    if (c === "'") return i + 1;
    // A backslash escapes the following character (e.g. '\'' or '\\').
    i += c === '\\' ? 2 : 1;
  }
  return -1;
}

/** Returns the index just past the numeric literal that starts at `start`. */
function scanNumber(source: string, start: number): number {
  const n = source.length;
  let i = start;

  if (source.charAt(i) === '0') {
    const radix = source.charAt(i + 1);
    if (radix === 'x' || radix === 'X') {
      i += 2;
      while (i < n && (isHexDigit(source.charAt(i)) || source.charAt(i) === '_')) i++;
      return i;
    }
    if (radix === 'o' || radix === 'O') {
      i += 2;
      while (i < n && (isOctalDigit(source.charAt(i)) || source.charAt(i) === '_')) i++;
      return i;
    }
    if (radix === 'b' || radix === 'B') {
      i += 2;
      while (i < n && (isBinaryDigit(source.charAt(i)) || source.charAt(i) === '_')) i++;
      return i;
    }
  }

  while (i < n && (isDigit(source.charAt(i)) || source.charAt(i) === '_')) i++;

  // A fraction needs a digit right after the dot, so `1..10` and `1.foo()`
  // leave the dot for the operator/punctuation rules.
  if (source.charAt(i) === '.' && isDigit(source.charAt(i + 1))) {
    i++;
    while (i < n && (isDigit(source.charAt(i)) || source.charAt(i) === '_')) i++;
  }

  // An exponent is only consumed when at least one digit follows the
  // optional sign; otherwise the `e` belongs to whatever comes next.
  const marker = source.charAt(i);
  if (marker === 'e' || marker === 'E') {
    let j = i + 1;
    const sign = source.charAt(j);
    if (sign === '+' || sign === '-') j++;
    if (isDigit(source.charAt(j))) {
      i = j;
      while (i < n && isDigit(source.charAt(i))) i++;
    }
  }
  return i;
}

/** True for a single ASCII decimal digit; false for '' and anything else. */
function isDigit(c: string): boolean {
  return c >= '0' && c <= '9';
}

/** True for an ASCII hexadecimal digit in either case. */
function isHexDigit(c: string): boolean {
  return isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

/** True for an octal digit, `0` through `7`. */
function isOctalDigit(c: string): boolean {
  return c >= '0' && c <= '7';
}

/** True for a binary digit, `0` or `1`. */
function isBinaryDigit(c: string): boolean {
  return c === '0' || c === '1';
}

/** True for a character that may begin an identifier: ASCII letter or `_`. */
function isIdentStart(c: string): boolean {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c === '_';
}

/** True for a character that may continue an identifier: ASCII letter, digit or `_`. */
function isIdentCont(c: string): boolean {
  return isIdentStart(c) || isDigit(c);
}