diff --git a/src/irc/lexer.cpp b/src/irc/lexer.cpp index 0a8083b..47a4161 100644 --- a/src/irc/lexer.cpp +++ b/src/irc/lexer.cpp @@ -1,9 +1,10 @@ #include "lexer.h" #include +#include namespace nix_irc { -Lexer::Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {} +Lexer::Lexer(std::string input) : input(std::move(input)), pos(0), line(1), col(1) {} std::vector Lexer::tokenize() { #define TOKEN(t) \ @@ -109,9 +110,10 @@ std::vector Lexer::tokenize() { // If we found > and there's content, it's a lookup path if (end < input.size() && input[end] == '>' && end > pos + 1) { std::string path = input.substr(pos + 1, end - pos - 1); + size_t consumed = end - pos + 1; tokens.push_back({Token::LOOKUP_PATH, path, line, col}); pos = end + 1; - col += (end - pos + 1); + col += consumed; is_lookup_path = true; } @@ -123,8 +125,14 @@ std::vector Lexer::tokenize() { } else if (c == '!') { emit(TOKEN(NOT)); } else if (c == '.') { + // Relative paths: ./foo and ../foo + if (pos + 1 < input.size() && input[pos + 1] == '/') { + tokenize_path(); + } else if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '/') { + tokenize_path(); + } // Check for ellipsis (...) - if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') { + else if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') { tokens.push_back(TOKEN(ELLIPSIS)); pos += 3; col += 3; @@ -176,8 +184,8 @@ std::vector Lexer::tokenize() { tokenize_ident(); } } else { - pos++; - col++; + throw std::runtime_error("Unexpected character '" + std::string(1, c) + "' at " + + std::to_string(line) + ":" + std::to_string(col)); } } tokens.push_back({Token::EOF_, "", line, col}); @@ -210,10 +218,16 @@ void Lexer::skip_whitespace() { } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') { // Block comment /* ... */ // Note: Nix block comments do NOT nest + size_t start_line = line; + size_t start_col = col; + bool terminated = false; pos += 2; // Skip /* + col += 2; while (pos + 1 < input.size()) { if (input[pos] == '*' && input[pos + 1] == '/') { pos += 2; // Skip */ + col += 2; + terminated = true; break; } if (input[pos] == '\n') { @@ -224,6 +238,10 @@ void Lexer::skip_whitespace() { } pos++; } + if (!terminated) { + throw std::runtime_error("Unterminated block comment at " + std::to_string(start_line) + + ":" + std::to_string(start_col)); + } } else { break; } @@ -231,13 +249,17 @@ void Lexer::skip_whitespace() { } void Lexer::tokenize_string() { + size_t start_line = line; + size_t start_col = col; pos++; + col++; std::string s; bool has_interp = false; while (pos < input.size() && input[pos] != '"') { if (input[pos] == '\\' && pos + 1 < input.size()) { pos++; + col++; switch (input[pos]) { case 'n': s += '\n'; @@ -262,21 +284,35 @@ void Lexer::tokenize_string() { break; } pos++; + col++; } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') { // Found interpolation marker has_interp = true; s += input[pos]; // Keep $ in raw string pos++; + col++; } else { + if (input[pos] == '\n') { + s += input[pos]; + pos++; + line++; + col = 1; + continue; + } s += input[pos]; pos++; + col++; } } + if (pos >= input.size()) { + throw std::runtime_error("Unterminated string at " + std::to_string(start_line) + ":" + + std::to_string(start_col)); + } pos++; + col++; Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING; - tokens.push_back({type, s, line, col}); - col += s.size() + 2; + tokens.push_back({type, s, start_line, start_col}); } void Lexer::tokenize_indented_string() { diff --git a/src/irc/lexer.h b/src/irc/lexer.h index f83087b..3804ed7 100644 --- a/src/irc/lexer.h +++ b/src/irc/lexer.h @@ -68,12 +68,12 @@ struct Token { class Lexer { public: - Lexer(const std::string& input); + explicit Lexer(std::string input); std::vector tokenize(); private: std::vector tokens; - const std::string& input; + std::string input; size_t pos; size_t line; size_t col; diff --git a/src/irc/parser.cpp b/src/irc/parser.cpp index 9b05b55..7180c8e 100644 --- a/src/irc/parser.cpp +++ b/src/irc/parser.cpp @@ -18,6 +18,116 @@ static std::string trim(const std::string& s) { return s.substr(start, end - start + 1); } +static const char* token_type_name(Token::Type type) { + switch (type) { + case Token::LPAREN: + return "LPAREN"; + case Token::RPAREN: + return "RPAREN"; + case Token::LBRACE: + return "LBRACE"; + case Token::RBRACE: + return "RBRACE"; + case Token::LBRACKET: + return "LBRACKET"; + case Token::RBRACKET: + return "RBRACKET"; + case Token::IDENT: + return "IDENT"; + case Token::STRING: + return "STRING"; + case Token::STRING_INTERP: + return "STRING_INTERP"; + case Token::INDENTED_STRING: + return "INDENTED_STRING"; + case Token::INDENTED_STRING_INTERP: + return "INDENTED_STRING_INTERP"; + case Token::PATH: + return "PATH"; + case Token::LOOKUP_PATH: + return "LOOKUP_PATH"; + case Token::INT: + return "INT"; + case Token::FLOAT: + return "FLOAT"; + case Token::URI: + return "URI"; + case Token::BOOL: + return "BOOL"; + case Token::LET: + return "LET"; + case Token::IN: + return "IN"; + case Token::REC: + return "REC"; + case Token::IF: + return "IF"; + case Token::THEN: + return "THEN"; + case Token::ELSE: + return "ELSE"; + case Token::ASSERT: + return "ASSERT"; + case Token::WITH: + return "WITH"; + case Token::INHERIT: + return "INHERIT"; + case Token::IMPORT: + return "IMPORT"; + case Token::DOT: + return "DOT"; + case Token::SEMICOLON: + return "SEMICOLON"; + case Token::COLON: + return "COLON"; + case Token::EQUALS: + return "EQUALS"; + case Token::AT: + return "AT"; + case Token::COMMA: + return "COMMA"; + case Token::QUESTION: + return "QUESTION"; + case Token::ELLIPSIS: + return "ELLIPSIS"; + case Token::PLUS: + return "PLUS"; + case Token::MINUS: + return "MINUS"; + case Token::STAR: + return "STAR"; + case Token::SLASH: + return "SLASH"; + case Token::CONCAT: + return "CONCAT"; + case Token::MERGE: + return "MERGE"; + case Token::EQEQ: + return "EQEQ"; + case Token::NE: + return "NE"; + case Token::LT: + return "LT"; + case Token::GT: + return "GT"; + case Token::LE: + return "LE"; + case Token::GE: + return "GE"; + case Token::AND: + return "AND"; + case Token::OR: + return "OR"; + case Token::IMPL: + return "IMPL"; + case Token::NOT: + return "NOT"; + case Token::EOF_: + return "EOF"; + } + return "UNKNOWN"; +} + static std::string read_file(const std::string& path) { FILE* f = fopen(path.c_str(), "r"); if (!f) { @@ -85,14 +195,17 @@ public: bool expect(Token::Type type) { if (current().type != type) { - throw std::runtime_error( - "Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) + - " at " + std::to_string(current().line) + ":" + std::to_string(current().col)); + throw std::runtime_error("Expected token " + std::string(token_type_name(type)) + + " but got " + token_type_name(current().type) + " at " + + std::to_string(current().line) + ":" + + std::to_string(current().col)); } advance(); return true; } + bool is_right_associative(Token::Type type) { return type == Token::IMPL; } + // Get operator precedence (higher = tighter binding) int get_precedence(Token::Type type) { switch (type) { @@ -263,7 +376,8 @@ public: Token op_token = current(); advance(); - auto right = parse_binary_op(prec + 1); + int next_prec = is_right_associative(op_token.type) ? prec : prec + 1; + auto right = parse_binary_op(next_prec); left = std::make_shared(BinaryOpNode(token_to_binop(op_token.type), left, right)); } diff --git a/src/irc/types.cpp b/src/irc/types.cpp index 1ee8a8e..82960b9 100644 --- a/src/irc/types.cpp +++ b/src/irc/types.cpp @@ -16,8 +16,8 @@ BinaryOpNode::BinaryOpNode(BinaryOp o, std::shared_ptr l, std::shared_ptr< : op(o), left(std::move(l)), right(std::move(r)), line(ln) {} // UnaryOpNode constructor -UnaryOpNode::UnaryOpNode(UnaryOp o, std::shared_ptr operand, uint32_t l) - : op(o), operand(std::move(operand)), line(l) {} +UnaryOpNode::UnaryOpNode(UnaryOp o, std::shared_ptr operand_ptr, uint32_t l) + : op(o), operand(std::move(operand_ptr)), line(l) {} // SelectNode constructor SelectNode::SelectNode(std::shared_ptr e, std::shared_ptr a, uint32_t l)