// PiCloud/dashboard/src/lib/rhai/lexer.ts

// Tokenizer for the dashboard's Rhai parser.
//
// Produces a flat array of tokens (eager — Rhai scripts in the dashboard
// are small, 20–200 lines typical) plus a separate list of comments. The
// parser only sees tokens; comments are handed to the formatter so it
// can re-emit them at the right positions.
//
// Keyword and operator lists trace back to the upstream TextMate grammar
// (rhaiscript/vscode-rhai). We don't copy any grammar bytes.

import type { Comment, Range } from './ast';

/**
 * Lexical category assigned to every token by `tokenize`. `EOF` is the
 * zero-width sentinel appended after the last real token.
 */
export type TokenKind = 'Ident' | 'Keyword' | 'Number' | 'String' | 'Punct' | 'Operator' | 'EOF';

/**
 * A single lexical token. On top of the fields inherited from `Range`, it
 * carries the token's category and its source text.
 */
export interface Token extends Range {
	/** Lexical category of the token. */
	kind: TokenKind;
	// For Ident/Keyword/Punct/Operator: the literal source text. For
	// Number/String: the full literal including quotes. The EOF token
	// carries an empty string.
	text: string;
}

/**
 * Words the lexer classifies as `Keyword` instead of `Ident`. Insertion
 * order is kept stable in case any consumer iterates the set.
 */
export const KEYWORDS = new Set([
	// Declarations
	'let', 'const', 'fn',
	// Branching, loops and jumps
	'if', 'else', 'while', 'loop', 'do', 'for', 'in', 'return', 'break', 'continue',
	// Switch
	'switch', 'case', 'default',
	// Literals
	'true', 'false', 'null',
	// Error handling
	'try', 'catch', 'throw',
	// Miscellaneous
	'as', 'is', 'private'
]);

// Operators made of two or more characters. The lexer tries them in array
// order and takes the first match, so every operator must come before any
// shorter operator that is a prefix of it (`??=` before `??`, `..=` before
// `..`).
const MULTI_CHAR_OPS = [
	// Three characters
	'??=', '..=',
	// Null-coalescing, range, path
	'??', '..', '::',
	// Comparison
	'==', '!=', '<=', '>=',
	// Logical
	'&&', '||',
	// Shifts
	'<<', '>>',
	// Compound assignment
	'+=', '-=', '*=', '/=', '%=',
	// Arrows
	'=>', '->'
];

// One-character operators; only consulted after no multi-char operator
// matched at the current position.
const SINGLE_CHAR_OPS = new Set([
	// Arithmetic
	'+', '-', '*', '/', '%',
	// Comparison
	'<', '>',
	// Logical / bitwise
	'!', '&', '|', '^', '~',
	// Assignment and `?`
	'=', '?'
]);

// Punctuation characters, each emitted as its own `Punct` token.
// `#` is listed so the start of a `#{` object-map literal is recognized:
// the lexer emits the `#` by itself and the parser pairs it with the `{`
// that follows.
const PUNCTS = new Set([
	// Brackets
	'(', ')', '{', '}', '[', ']',
	// Separators
	';', ',', '.', ':',
	// Object-map prefix
	'#'
]);

/** Output of `tokenize`: the token stream and the comments, kept apart. */
export interface LexResult {
	/** Tokens in source order; the last entry is always the `EOF` token. */
	tokens: Token[];
	/** Line and block comments in source order; they never appear in `tokens`. */
	comments: Comment[];
}

/**
 * Tokenizes a Rhai script in a single eager pass.
 *
 * Whitespace is dropped, comments are collected into their own list, and
 * everything else becomes a token. A character that matches no rule is
 * skipped without producing a token. Unterminated strings and block
 * comments run to the end of the input.
 *
 * All `start`/`end` values are offsets into `source` (`end` exclusive) and
 * never exceed `source.length`.
 *
 * NOTE(review): single-quoted character literals have no rule here, so
 * their quotes fall through to the "unrecognized" path — confirm whether
 * the parser needs them.
 *
 * @param source - Full text of the script.
 * @returns The tokens in source order, terminated by a zero-width `EOF`
 * token at `source.length`, together with the comments in source order.
 */
export function tokenize(source: string): LexResult {
	const tokens: Token[] = [];
	const comments: Comment[] = [];
	let i = 0;
	const n = source.length;

	while (i < n) {
		const ch = source[i];

		// Whitespace
		if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
			i++;
			continue;
		}

		// Line comment. Stops at `\r` as well as `\n` so CRLF input does not
		// leave a stray carriage return in the comment text; the `\r` is
		// then consumed as ordinary whitespace.
		if (ch === '/' && source[i + 1] === '/') {
			const start = i;
			while (i < n && source[i] !== '\n' && source[i] !== '\r') i++;
			comments.push({ kind: 'LineComment', start, end: i, text: source.slice(start, i) });
			continue;
		}

		// Block comment (supports nesting per the Rhai book). An unterminated
		// comment extends to the end of the input.
		if (ch === '/' && source[i + 1] === '*') {
			const start = i;
			i += 2;
			let depth = 1;
			while (i < n && depth > 0) {
				if (source[i] === '/' && source[i + 1] === '*') {
					depth++;
					i += 2;
				} else if (source[i] === '*' && source[i + 1] === '/') {
					depth--;
					i += 2;
				} else {
					i++;
				}
			}
			comments.push({ kind: 'BlockComment', start, end: i, text: source.slice(start, i) });
			continue;
		}

		// Strings: " ... " (escape-aware, single-line by convention) and
		// ` ... ` (raw, multi-line). We tokenize the entire literal including
		// quotes; the parser only cares about its position and text.
		if (ch === '"' || ch === '`') {
			const quote = ch;
			const start = i;
			i++;
			while (i < n) {
				const c = source[i];
				if (c === '\\' && quote === '"') {
					// Skip the backslash and the character it escapes. Clamp
					// to `n` so a trailing backslash at end of input cannot
					// push the token's `end` past the source length.
					i = Math.min(i + 2, n);
					continue;
				}
				if (c === quote) {
					i++;
					break;
				}
				i++;
			}
			tokens.push({ kind: 'String', start, end: i, text: source.slice(start, i) });
			continue;
		}

		// Numbers: hex, binary, decimal, optional `.frac`, optional exponent.
		// Underscores are allowed as digit separators per Rhai.
		if (isDigit(ch)) {
			const start = i;
			if (ch === '0' && (source[i + 1] === 'x' || source[i + 1] === 'X')) {
				i += 2;
				while (i < n && (isHexDigit(source[i]) || source[i] === '_')) i++;
			} else if (ch === '0' && (source[i + 1] === 'b' || source[i + 1] === 'B')) {
				i += 2;
				while (i < n && (source[i] === '0' || source[i] === '1' || source[i] === '_')) i++;
			} else {
				while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
				// The dot is only part of the number when a digit follows, so
				// `1..5` and `1.max` keep their `.` for the operator/punct rules.
				if (source[i] === '.' && isDigit(source[i + 1])) {
					i++;
					while (i < n && (isDigit(source[i]) || source[i] === '_')) i++;
				}
				// Exponent marker, optional sign, then digits (no underscores).
				if (source[i] === 'e' || source[i] === 'E') {
					i++;
					if (source[i] === '+' || source[i] === '-') i++;
					while (i < n && isDigit(source[i])) i++;
				}
			}
			tokens.push({ kind: 'Number', start, end: i, text: source.slice(start, i) });
			continue;
		}

		// Identifier or keyword
		if (isIdentStart(ch)) {
			const start = i;
			i++;
			while (i < n && isIdentCont(source[i])) i++;
			const text = source.slice(start, i);
			tokens.push({
				kind: KEYWORDS.has(text) ? 'Keyword' : 'Ident',
				start,
				end: i,
				text
			});
			continue;
		}

		// Multi-char operators. The list is ordered longest-first, so the
		// first hit is the longest operator at this position.
		let matched = false;
		for (const op of MULTI_CHAR_OPS) {
			if (source.startsWith(op, i)) {
				tokens.push({ kind: 'Operator', start: i, end: i + op.length, text: op });
				i += op.length;
				matched = true;
				break;
			}
		}
		if (matched) continue;

		// Single-char operator
		if (SINGLE_CHAR_OPS.has(ch)) {
			tokens.push({ kind: 'Operator', start: i, end: i + 1, text: ch });
			i++;
			continue;
		}

		// Punctuation
		if (PUNCTS.has(ch)) {
			tokens.push({ kind: 'Punct', start: i, end: i + 1, text: ch });
			i++;
			continue;
		}

		// Unrecognized: skip and let the parser report the gap if needed.
		i++;
	}

	tokens.push({ kind: 'EOF', start: n, end: n, text: '' });
	return { tokens, comments };
}

/** True when `c` is an ASCII decimal digit, `0` through `9`. */
function isDigit(c: string): boolean {
	const atLeastZero = '0' <= c;
	const atMostNine = c <= '9';
	return atLeastZero && atMostNine;
}

/** True when `c` is an ASCII hexadecimal digit (`0-9`, `a-f` or `A-F`). */
function isHexDigit(c: string): boolean {
	if (isDigit(c)) return true;
	const lowerHex = 'a' <= c && c <= 'f';
	const upperHex = 'A' <= c && c <= 'F';
	return lowerHex || upperHex;
}

/** True when `c` may begin an identifier: an ASCII letter or `_`. */
function isIdentStart(c: string): boolean {
	if (c === '_') return true;
	const lowerCase = 'a' <= c && c <= 'z';
	const upperCase = 'A' <= c && c <= 'Z';
	return lowerCase || upperCase;
}

/** True when `c` may continue an identifier: a start character or a digit. */
function isIdentCont(c: string): boolean {
	if (isIdentStart(c)) return true;
	return isDigit(c);
}