irc/parser: fix lexer ownership, errors, and implication parsing

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I12a6b52ec1c0edff605d02393eafde896a6a6964
This commit is contained in:
raf 2026-04-24 18:35:16 +03:00
commit b319ef6f3f
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
4 changed files with 165 additions and 15 deletions

View file

@ -1,9 +1,10 @@
#include "lexer.h" #include "lexer.h"
#include <cctype> #include <cctype>
#include <stdexcept>
namespace nix_irc { namespace nix_irc {
Lexer::Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {} Lexer::Lexer(std::string input) : input(std::move(input)), pos(0), line(1), col(1) {}
std::vector<Token> Lexer::tokenize() { std::vector<Token> Lexer::tokenize() {
#define TOKEN(t) \ #define TOKEN(t) \
@ -109,9 +110,10 @@ std::vector<Token> Lexer::tokenize() {
// If we found > and there's content, it's a lookup path // If we found > and there's content, it's a lookup path
if (end < input.size() && input[end] == '>' && end > pos + 1) { if (end < input.size() && input[end] == '>' && end > pos + 1) {
std::string path = input.substr(pos + 1, end - pos - 1); std::string path = input.substr(pos + 1, end - pos - 1);
size_t consumed = end - pos + 1;
tokens.push_back({Token::LOOKUP_PATH, path, line, col}); tokens.push_back({Token::LOOKUP_PATH, path, line, col});
pos = end + 1; pos = end + 1;
col += (end - pos + 1); col += consumed;
is_lookup_path = true; is_lookup_path = true;
} }
@ -123,8 +125,14 @@ std::vector<Token> Lexer::tokenize() {
} else if (c == '!') { } else if (c == '!') {
emit(TOKEN(NOT)); emit(TOKEN(NOT));
} else if (c == '.') { } else if (c == '.') {
// Relative paths: ./foo and ../foo
if (pos + 1 < input.size() && input[pos + 1] == '/') {
tokenize_path();
} else if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '/') {
tokenize_path();
}
// Check for ellipsis (...) // Check for ellipsis (...)
if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') { else if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
tokens.push_back(TOKEN(ELLIPSIS)); tokens.push_back(TOKEN(ELLIPSIS));
pos += 3; pos += 3;
col += 3; col += 3;
@ -176,8 +184,8 @@ std::vector<Token> Lexer::tokenize() {
tokenize_ident(); tokenize_ident();
} }
} else { } else {
pos++; throw std::runtime_error("Unexpected character '" + std::string(1, c) + "' at " +
col++; std::to_string(line) + ":" + std::to_string(col));
} }
} }
tokens.push_back({Token::EOF_, "", line, col}); tokens.push_back({Token::EOF_, "", line, col});
@ -210,10 +218,16 @@ void Lexer::skip_whitespace() {
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') { } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
// Block comment /* ... */ // Block comment /* ... */
// Note: Nix block comments do NOT nest // Note: Nix block comments do NOT nest
size_t start_line = line;
size_t start_col = col;
bool terminated = false;
pos += 2; // Skip /* pos += 2; // Skip /*
col += 2;
while (pos + 1 < input.size()) { while (pos + 1 < input.size()) {
if (input[pos] == '*' && input[pos + 1] == '/') { if (input[pos] == '*' && input[pos + 1] == '/') {
pos += 2; // Skip */ pos += 2; // Skip */
col += 2;
terminated = true;
break; break;
} }
if (input[pos] == '\n') { if (input[pos] == '\n') {
@ -224,6 +238,10 @@ void Lexer::skip_whitespace() {
} }
pos++; pos++;
} }
if (!terminated) {
throw std::runtime_error("Unterminated block comment at " + std::to_string(start_line) +
":" + std::to_string(start_col));
}
} else { } else {
break; break;
} }
@ -231,13 +249,17 @@ void Lexer::skip_whitespace() {
} }
void Lexer::tokenize_string() { void Lexer::tokenize_string() {
size_t start_line = line;
size_t start_col = col;
pos++; pos++;
col++;
std::string s; std::string s;
bool has_interp = false; bool has_interp = false;
while (pos < input.size() && input[pos] != '"') { while (pos < input.size() && input[pos] != '"') {
if (input[pos] == '\\' && pos + 1 < input.size()) { if (input[pos] == '\\' && pos + 1 < input.size()) {
pos++; pos++;
col++;
switch (input[pos]) { switch (input[pos]) {
case 'n': case 'n':
s += '\n'; s += '\n';
@ -262,21 +284,35 @@ void Lexer::tokenize_string() {
break; break;
} }
pos++; pos++;
col++;
} else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') { } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
// Found interpolation marker // Found interpolation marker
has_interp = true; has_interp = true;
s += input[pos]; // Keep $ in raw string s += input[pos]; // Keep $ in raw string
pos++; pos++;
col++;
} else { } else {
if (input[pos] == '\n') {
s += input[pos]; s += input[pos];
pos++; pos++;
line++;
col = 1;
continue;
} }
s += input[pos];
pos++;
col++;
}
}
if (pos >= input.size()) {
throw std::runtime_error("Unterminated string at " + std::to_string(start_line) + ":" +
std::to_string(start_col));
} }
pos++; pos++;
col++;
Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING; Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
tokens.push_back({type, s, line, col}); tokens.push_back({type, s, start_line, start_col});
col += s.size() + 2;
} }
void Lexer::tokenize_indented_string() { void Lexer::tokenize_indented_string() {

View file

@ -68,12 +68,12 @@ struct Token {
class Lexer { class Lexer {
public: public:
Lexer(const std::string& input); explicit Lexer(std::string input);
std::vector<Token> tokenize(); std::vector<Token> tokenize();
private: private:
std::vector<Token> tokens; std::vector<Token> tokens;
const std::string& input; std::string input;
size_t pos; size_t pos;
size_t line; size_t line;
size_t col; size_t col;

View file

@ -18,6 +18,116 @@ static std::string trim(const std::string& s) {
return s.substr(start, end - start + 1); return s.substr(start, end - start + 1);
} }
// Maps a Token::Type enumerator to a printable name for diagnostics.
//
// The returned pointer refers to a string literal, so it stays valid for the
// whole program and must not be freed. Every name is the enumerator spelled
// verbatim, except Token::EOF_, which is reported as "EOF".
//
// No `default:` label is used, so compilers with -Wswitch can flag an
// enumerator that is missing here; a value matching no case yields "UNKNOWN".
static const char* token_type_name(Token::Type type) {
  switch (type) {
  // Brackets
  case Token::LPAREN:   return "LPAREN";
  case Token::RPAREN:   return "RPAREN";
  case Token::LBRACE:   return "LBRACE";
  case Token::RBRACE:   return "RBRACE";
  case Token::LBRACKET: return "LBRACKET";
  case Token::RBRACKET: return "RBRACKET";

  // Identifiers and literal-like tokens
  case Token::IDENT:                  return "IDENT";
  case Token::STRING:                 return "STRING";
  case Token::STRING_INTERP:          return "STRING_INTERP";
  case Token::INDENTED_STRING:        return "INDENTED_STRING";
  case Token::INDENTED_STRING_INTERP: return "INDENTED_STRING_INTERP";
  case Token::PATH:                   return "PATH";
  case Token::LOOKUP_PATH:            return "LOOKUP_PATH";
  case Token::INT:                    return "INT";
  case Token::FLOAT:                  return "FLOAT";
  case Token::URI:                    return "URI";
  case Token::BOOL:                   return "BOOL";

  // Keywords
  case Token::LET:     return "LET";
  case Token::IN:      return "IN";
  case Token::REC:     return "REC";
  case Token::IF:      return "IF";
  case Token::THEN:    return "THEN";
  case Token::ELSE:    return "ELSE";
  case Token::ASSERT:  return "ASSERT";
  case Token::WITH:    return "WITH";
  case Token::INHERIT: return "INHERIT";
  case Token::IMPORT:  return "IMPORT";

  // Punctuation
  case Token::DOT:       return "DOT";
  case Token::SEMICOLON: return "SEMICOLON";
  case Token::COLON:     return "COLON";
  case Token::EQUALS:    return "EQUALS";
  case Token::AT:        return "AT";
  case Token::COMMA:     return "COMMA";
  case Token::QUESTION:  return "QUESTION";
  case Token::ELLIPSIS:  return "ELLIPSIS";

  // Operators
  case Token::PLUS:   return "PLUS";
  case Token::MINUS:  return "MINUS";
  case Token::STAR:   return "STAR";
  case Token::SLASH:  return "SLASH";
  case Token::CONCAT: return "CONCAT";
  case Token::MERGE:  return "MERGE";
  case Token::EQEQ:   return "EQEQ";
  case Token::NE:     return "NE";
  case Token::LT:     return "LT";
  case Token::GT:     return "GT";
  case Token::LE:     return "LE";
  case Token::GE:     return "GE";
  case Token::AND:    return "AND";
  case Token::OR:     return "OR";
  case Token::IMPL:   return "IMPL";
  case Token::NOT:    return "NOT";

  // End of input: the enumerator carries a trailing underscore, the name does not.
  case Token::EOF_: return "EOF";
  }

  // Reached only when `type` holds a value that matches none of the cases above.
  return "UNKNOWN";
}
static std::string read_file(const std::string& path) { static std::string read_file(const std::string& path) {
FILE* f = fopen(path.c_str(), "r"); FILE* f = fopen(path.c_str(), "r");
if (!f) { if (!f) {
@ -85,14 +195,17 @@ public:
bool expect(Token::Type type) { bool expect(Token::Type type) {
if (current().type != type) { if (current().type != type) {
throw std::runtime_error( throw std::runtime_error("Expected token " + std::string(token_type_name(type)) +
"Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) + " but got " + token_type_name(current().type) + " at " +
" at " + std::to_string(current().line) + ":" + std::to_string(current().col)); std::to_string(current().line) + ":" +
std::to_string(current().col));
} }
advance(); advance();
return true; return true;
} }
// Returns true when `type` is a right-associative binary operator.
// Token::IMPL is the only token type for which this returns true.
bool is_right_associative(Token::Type type) {
  const bool is_implication = (type == Token::IMPL);
  return is_implication;
}
// Get operator precedence (higher = tighter binding) // Get operator precedence (higher = tighter binding)
int get_precedence(Token::Type type) { int get_precedence(Token::Type type) {
switch (type) { switch (type) {
@ -263,7 +376,8 @@ public:
Token op_token = current(); Token op_token = current();
advance(); advance();
auto right = parse_binary_op(prec + 1); int next_prec = is_right_associative(op_token.type) ? prec : prec + 1;
auto right = parse_binary_op(next_prec);
left = std::make_shared<Node>(BinaryOpNode(token_to_binop(op_token.type), left, right)); left = std::make_shared<Node>(BinaryOpNode(token_to_binop(op_token.type), left, right));
} }

View file

@ -16,8 +16,8 @@ BinaryOpNode::BinaryOpNode(BinaryOp o, std::shared_ptr<Node> l, std::shared_ptr<
: op(o), left(std::move(l)), right(std::move(r)), line(ln) {} : op(o), left(std::move(l)), right(std::move(r)), line(ln) {}
// UnaryOpNode constructor // UnaryOpNode constructor
UnaryOpNode::UnaryOpNode(UnaryOp o, std::shared_ptr<Node> operand, uint32_t l) UnaryOpNode::UnaryOpNode(UnaryOp o, std::shared_ptr<Node> operand_ptr, uint32_t l)
: op(o), operand(std::move(operand)), line(l) {} : op(o), operand(std::move(operand_ptr)), line(l) {}
// SelectNode constructor // SelectNode constructor
SelectNode::SelectNode(std::shared_ptr<Node> e, std::shared_ptr<Node> a, uint32_t l) SelectNode::SelectNode(std::shared_ptr<Node> e, std::shared_ptr<Node> a, uint32_t l)