import re


class Lexer:
    """Tokenizer for a small subset of Lua source code.

    Produces ``(TYPE, text)`` tuples; whitespace and comments are dropped.
    """

    # Rules are tried in order; the first regex that matches at the current
    # position wins.  Ordering matters: COMMENT before OP (so '--' is not two
    # minus signs), KEYWORD before IDENT (so 'local' is not an identifier),
    # and multi-char operators before their single-char prefixes.
    #
    # BUGFIX vs. the original:
    #   * '--[[' was written with unescaped brackets, so regex treated
    #     '[[.*?]' as a character class and long comments never matched.
    #   * The line-comment fallback '--.*' was compiled with re.DOTALL,
    #     which let '.' cross newlines and swallow the rest of the file.
    #   * The long-bracket STRING alternative had the same unescaped
    #     character-class bug.
    RULES = [
        ('COMMENT', re.compile(r'--\[\[.*?\]\]|--[^\n]*', re.DOTALL)),
        ('STRING', re.compile(
            r'"(?:\\.|[^"\\])*"'      # double-quoted, with backslash escapes
            r"|'(?:\\.|[^'\\])*'"     # single-quoted, with backslash escapes
            r'|\[\[.*?\]\]',          # long bracket string [[ ... ]]
            re.DOTALL)),
        ('NUMBER', re.compile(r'\d+\.?\d*')),
        ('KEYWORD', re.compile(
            r'\b(and|break|do|else|elseif|end|false|for|function|if|in'
            r'|local|nil|not|or|repeat|return|then|true|until|while)\b')),
        ('IDENT', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
        ('OP', re.compile(
            r'==|~=|<=|>=|\.\.\.|\.\.|>>|<<'      # multi-char operators first
            r'|[+\-*/%^#=<>(){}\[\];:,.]')),      # then single-char operators
        ('SPACE', re.compile(r'\s+')),
    ]

    def __init__(self, code):
        self.code = code
        self.tokens = []
        self.pos = 0
        # Kept as an instance attribute for compatibility with the original
        # interface (callers may inspect/override self.rules).
        self.rules = self.RULES

    def tokenize(self):
        """Scan ``self.code`` and return the list of (type, text) tokens.

        SPACE and COMMENT matches are consumed but not emitted.  A character
        no rule matches is silently skipped so lexing always terminates.
        """
        while self.pos < len(self.code):
            match = None
            for name, regex in self.rules:
                match = regex.match(self.code, self.pos)
                if match:
                    if name not in ('SPACE', 'COMMENT'):
                        self.tokens.append((name, match.group(0)))
                    self.pos = match.end()
                    break
            if not match:
                self.pos += 1  # unrecognized character: skip and keep going
        return self.tokens


class Parser:
    """Minimal recursive-descent parser over Lexer tokens.

    Recognizes only two statement forms — (local) assignments and simple
    call statements — and represents expressions as single tokens.  Anything
    it cannot parse is skipped one token at a time.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def peek(self, offset=0):
        """Return the token at pos+offset, or (None, None) past the end."""
        index = self.pos + offset
        return self.tokens[index] if index < len(self.tokens) else (None, None)

    def consume(self, expected_type=None, expected_value=None):
        """Consume and return the current token if it matches, else None.

        Either constraint may be omitted; a failed match does not advance.
        """
        token = self.peek()
        if not token or not token[0]:
            return None
        if expected_type and token[0] != expected_type:
            return None
        if expected_value and token[1] != expected_value:
            return None
        self.pos += 1
        return token

    def parse(self):
        """Parse the whole token stream into a flat list of statement nodes."""
        nodes = []
        while self.pos < len(self.tokens):
            node = self.parse_statement()
            if node:
                nodes.append(node)
            else:
                self.pos += 1  # error recovery: always make progress
        return nodes

    def parse_statement(self):
        """Parse one statement; returns a node dict or None on failure.

        Supported forms:
          * ``local NAME = expr``  -> {'type': 'assign', ..., 'local': True}
          * ``NAME = expr``        -> {'type': 'assign', ..., 'local': False}
          * ``NAME(args...)``      -> {'type': 'call', 'name', 'args'}
        """
        token = self.peek()
        if not token or not token[0]:
            return None
        if token[1] == 'local':
            self.consume()
            ident = self.consume('IDENT')
            if ident:
                if self.peek()[1] == '=':
                    self.consume()
                    val = self.parse_expression()
                    return {'type': 'assign', 'name': ident[1],
                            'value': val, 'local': True}
            # 'local' without a following 'NAME =' is not represented.
            return None
        if token[0] == 'IDENT':
            ident = self.consume()[1]
            next_token = self.peek()
            if next_token[1] == '(':
                self.consume()
                args = []
                while self.peek()[1] and self.peek()[1] != ')':
                    args.append(self.parse_expression())
                    if self.peek()[1] == ',':
                        self.consume()
                self.consume('OP', ')')
                return {'type': 'call', 'name': ident, 'args': args}
            elif next_token[1] == '=':
                self.consume()
                val = self.parse_expression()
                return {'type': 'assign', 'name': ident,
                        'value': val, 'local': False}
        return None

    def parse_expression(self):
        """Consume one token and wrap it as an expression node (or None)."""
        token = self.consume()
        if not token:
            return None
        return {'type': token[0], 'value': token[1]}