38029-vm/core/parser.py

import re

class Lexer:
    def __init__(self, code):
        self.code = code
        self.tokens = []
        self.pos = 0

        # Using chr() to build regex parts and avoid tool-induced corruption.
        LB = chr(91)
        RB = chr(93)
        DQ = chr(34)
        SQ = chr(39)
        BS = chr(92)

        # List every operator individually to avoid character set issues.
        OP_LIST = '==|~=|<=|>=|\.\.\.|\.\.|>>|<<|\+|\-|\*|/|%|\^|#|=|\<|\>|\(|\)|\{|\}|' + BS + LB + '|' + BS + RB + '|;|:|,|\.'

        self.rules = [
            ('COMMENT', re.compile('--' + LB + LB + '.*?' + RB + RB + '|--.*', re.DOTALL)),
            ('STRING', re.compile(DQ + '(?:' + BS + BS + '.|[^' + DQ + BS + BS + '])*' + DQ + '|' + SQ + '(?:' + BS + BS + '.|[^' + SQ + BS + BS + '])*' + SQ + '|' + LB + LB + '.*?' + RB + RB, re.DOTALL)),
            ('NUMBER', re.compile(r'\d+\.?\d*')),
            ('KEYWORD', re.compile(r'\b(and|break|do|else|elseif|end|false|for|function|if|in|local|nil|not|or|repeat|return|then|true|until|while)\b')),
            ('IDENT', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
            ('OP', re.compile(OP_LIST)),
            ('SPACE', re.compile(r'\s+'))
        ]

    def tokenize(self):
        while self.pos < len(self.code):
            match = None
            for name, regex in self.rules:
                match = regex.match(self.code, self.pos)
                if match:
                    if name != 'SPACE' and name != 'COMMENT':
                        self.tokens.append((name, match.group(0)))
                    self.pos = match.end()
                    break
            if not match:
                self.pos += 1
        return self.tokens

class Parser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def peek(self, offset=0):
        index = self.pos + offset
        return self.tokens[index] if index < len(self.tokens) else (None, None)

    def consume(self, expected_type=None, expected_value=None):
        token = self.peek()
        if not token or not token[0]: return None
        if expected_type and token[0] != expected_type: return None
        if expected_value and token[1] != expected_value: return None
        self.pos += 1
        return token

    def parse(self):
        nodes = []
        while self.pos < len(self.tokens):
            node = self.parse_statement()
            if node:
                nodes.append(node)
            else:
                self.pos += 1
        return nodes

    def parse_statement(self):
        token = self.peek()
        if not token or not token[0]: return None

        if token[1] == 'local':
            self.consume()
            ident = self.consume('IDENT')
            if ident:
                if self.peek()[1] == '=':
                    self.consume()
                    val = self.parse_expression()
                    return {'type': 'assign', 'name': ident[1], 'value': val, 'local': True}
            return None

        if token[0] == 'IDENT':
            ident = self.consume()[1]
            next_token = self.peek()
            if next_token[1] == '(':
                self.consume()
                args = []
                while self.peek()[1] and self.peek()[1] != ')':
                    args.append(self.parse_expression())
                    if self.peek()[1] == ',':
                        self.consume()
                self.consume('OP', ')')
                return {'type': 'call', 'name': ident, 'args': args}
            elif next_token[1] == '=':
                self.consume()
                val = self.parse_expression()
                return {'type': 'assign', 'name': ident, 'value': val, 'local': False}

        return None

    def parse_expression(self):
        token = self.consume()
        if not token: return None
        return {'type': token[0], 'value': token[1]}