irc: split parser into lexer and parser components

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I4e73459a02caff5335d690656fd6f1396a6a6964
This commit is contained in:
raf 2026-04-24 14:39:51 +03:00
commit 0a5920adaf
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
3 changed files with 666 additions and 631 deletions

562
src/irc/lexer.cpp Normal file
View file

@ -0,0 +1,562 @@
#include "lexer.h"
#include <cctype>
namespace nix_irc {
// Construct a lexer over `src`. The source text is borrowed by reference and
// must outlive this object; position tracking starts at line 1, column 1.
Lexer::Lexer(const std::string& src) : input(src), pos(0), line(1), col(1) {}
// Scan the whole input and return the token stream, always terminated by a
// single EOF_ token. Longest-match rules: two-character operators are tested
// before their one-character prefixes, and paths/URIs/lookup paths are
// disambiguated by lookahead before falling back to operators/identifiers.
std::vector<Token> Lexer::tokenize() {
// Zero-length token literal stamped with the current line/col.
#define TOKEN(t) \
    Token { \
        Token::t, "", line, col \
    }
    // A digit run is a FLOAT only when it is immediately followed by '.' and
    // another digit; otherwise it is an INT, so selection such as `x.1` still
    // lexes as ... DOT INT. The previous code looked only one digit ahead,
    // which mis-lexed multi-digit floats like `12.5` as INT DOT INT.
    auto is_float_ahead = [&](size_t first_digit) {
        size_t d = first_digit;
        while (d < input.size() && isdigit(input[d]))
            d++;
        return d + 1 < input.size() && input[d] == '.' && isdigit(input[d + 1]);
    };
    while (pos < input.size()) {
        skip_whitespace();
        if (pos >= input.size())
            break;
        char c = input[pos];
        if (c == '(') {
            emit(TOKEN(LPAREN));
        } else if (c == ')') {
            emit(TOKEN(RPAREN));
        } else if (c == '{') {
            emit(TOKEN(LBRACE));
        } else if (c == '}') {
            emit(TOKEN(RBRACE));
        } else if (c == '[') {
            emit(TOKEN(LBRACKET));
        } else if (c == ']') {
            emit(TOKEN(RBRACKET));
        } else if (c == ';') {
            emit(TOKEN(SEMICOLON));
        } else if (c == ':') {
            emit(TOKEN(COLON));
        } else if (c == '@') {
            emit(TOKEN(AT));
        } else if (c == ',') {
            emit(TOKEN(COMMA));
        } else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
            tokenize_indented_string();
        } else if (c == '"') {
            tokenize_string();
        }
        // Two-char operators (must precede their one-char prefixes)
        else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(EQEQ));
            pos += 2;
            col += 2;
        } else if (c == '=') {
            emit(TOKEN(EQUALS));
        } else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(NE));
            pos += 2;
            col += 2;
        } else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(LE));
            pos += 2;
            col += 2;
        } else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(GE));
            pos += 2;
            col += 2;
        } else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
            tokens.push_back(TOKEN(CONCAT));
            pos += 2;
            col += 2;
        } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
            tokens.push_back(TOKEN(MERGE));
            pos += 2;
            col += 2;
        } else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
            tokens.push_back(TOKEN(AND));
            pos += 2;
            col += 2;
        } else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
            tokens.push_back(TOKEN(OR));
            pos += 2;
            col += 2;
        } else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
            tokens.push_back(TOKEN(IMPL));
            pos += 2;
            col += 2;
        }
        // Single-char operators
        else if (c == '+') {
            emit(TOKEN(PLUS));
        } else if (c == '*') {
            emit(TOKEN(STAR));
        } else if (c == '/') {
            // A '/' followed by a path character starts an absolute path;
            // otherwise it is division.
            if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
                tokenize_path();
            } else {
                emit(TOKEN(SLASH));
            }
        } else if (c == '<') {
            // Lookup path <nixpkgs> vs the less-than operator: scan a run of
            // path characters and require a closing '>' with non-empty body.
            size_t end = pos + 1;
            while (end < input.size() && (isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
                                          input[end] == '/' || input[end] == '.')) {
                end++;
            }
            if (end < input.size() && input[end] == '>' && end > pos + 1) {
                std::string path = input.substr(pos + 1, end - pos - 1);
                tokens.push_back({Token::LOOKUP_PATH, path, line, col});
                // Advance col by the whole "<...>" span BEFORE moving pos.
                // The previous order reassigned pos first, which collapsed
                // the col delta to zero and broke column tracking.
                col += end - pos + 1;
                pos = end + 1;
            } else {
                emit(TOKEN(LT));
            }
        } else if (c == '>') {
            emit(TOKEN(GT));
        } else if (c == '!') {
            emit(TOKEN(NOT));
        } else if (c == '.') {
            // Ellipsis (...) or attribute-selection dot.
            if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
                tokens.push_back(TOKEN(ELLIPSIS));
                pos += 3;
                col += 3;
            } else {
                emit(TOKEN(DOT));
            }
        } else if (c == '?') {
            emit(TOKEN(QUESTION));
        } else if (c == '~') {
            // Home-relative path ~/...; a lone '~' is lexed as an identifier.
            if (pos + 1 < input.size() && input[pos + 1] == '/') {
                tokenize_home_path();
            } else {
                tokenize_ident();
            }
        } else if (c == '-') {
            // Negative number literal or binary minus ('->' handled above).
            if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
                if (is_float_ahead(pos + 1)) {
                    tokenize_float();
                } else {
                    tokenize_int();
                }
            } else {
                emit(TOKEN(MINUS));
            }
        } else if (isdigit(c)) {
            if (is_float_ahead(pos)) {
                tokenize_float();
            } else {
                tokenize_int();
            }
        } else if (isalpha(c)) {
            // URI (scheme://...) vs identifier: look ahead for "://" right
            // after a run of scheme characters.
            size_t lookahead = pos;
            while (lookahead < input.size() &&
                   (isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
                    input[lookahead] == '+' || input[lookahead] == '.'))
                lookahead++;
            if (lookahead + 2 < input.size() && input[lookahead] == ':' && input[lookahead + 1] == '/' &&
                input[lookahead + 2] == '/') {
                tokenize_uri();
            } else {
                tokenize_ident();
            }
        } else {
            // Unknown byte: skip it so the scan always makes progress.
            pos++;
            col++;
        }
    }
    tokens.push_back({Token::EOF_, "", line, col});
#undef TOKEN
    return tokens;
}
// Record a single-character token and step past the character that produced
// it. Single-char tokens never contain a newline, so only the column moves.
void Lexer::emit(const Token& t) {
    tokens.push_back(t);
    ++pos;
    ++col;
}
// Advance `pos` past whitespace and comments, keeping `line`/`col` in sync.
// On return, `pos` is either at the start of a real token or at end of input.
void Lexer::skip_whitespace() {
    while (pos < input.size()) {
        char c = input[pos];
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            if (c == '\n') {
                line++;
                col = 1; // columns restart after each newline
            } else {
                col++;
            }
            pos++;
        } else if (c == '#') {
            // Line comment - skip until newline
            // (col is not advanced here; the '\n' that follows resets it,
            // and a line comment cannot be followed by a token on its line)
            while (pos < input.size() && input[pos] != '\n')
                pos++;
        } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
            // Block comment /* ... */
            // Note: Nix block comments do NOT nest
            pos += 2; // Skip /*
            // An unterminated comment simply ends this loop; the outer scan
            // then resumes on whatever trailing byte remains.
            while (pos + 1 < input.size()) {
                if (input[pos] == '*' && input[pos + 1] == '/') {
                    pos += 2; // Skip */
                    break;
                }
                if (input[pos] == '\n') {
                    line++;
                    col = 1;
                } else {
                    col++;
                }
                pos++;
            }
        } else {
            break; // next char starts a token
        }
    }
}
// Lex a double-quoted "..." string. Backslash escapes are decoded here;
// ${...} interpolation markers are kept in the raw text and flagged via the
// STRING_INTERP token type so the parser can split the string later.
// The token is stamped with the position of the opening quote, and
// line/col are tracked through the literal so tokens after a multi-line or
// escape-heavy string report correct positions (previously `line` was never
// advanced inside a string and `col` was advanced by the *decoded* length).
void Lexer::tokenize_string() {
    size_t start_line = line;
    size_t start_col = col;
    pos++; // consume opening '"'
    col++;
    std::string s;
    bool has_interp = false;
    while (pos < input.size() && input[pos] != '"') {
        if (input[pos] == '\\' && pos + 1 < input.size()) {
            pos++; // consume the backslash
            col++;
            switch (input[pos]) {
            case 'n':
                s += '\n';
                break;
            case 't':
                s += '\t';
                break;
            case 'r':
                s += '\r';
                break;
            case '"':
                s += '"';
                break;
            case '\\':
                s += '\\';
                break;
            case '$':
                s += '$';
                break; // Escaped $ - no interpolation
            default:
                s += input[pos]; // unknown escape: keep the char literally
                break;
            }
            pos++;
            col++;
        } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
            // Found interpolation marker
            has_interp = true;
            s += input[pos]; // Keep $ in raw string
            pos++;
            col++;
        } else {
            if (input[pos] == '\n') {
                // Plain strings may span lines; keep the position counters
                // accurate for everything lexed after the literal.
                line++;
                col = 1;
            } else {
                col++;
            }
            s += input[pos];
            pos++;
        }
    }
    pos++; // consume closing '"' (or step past EOF on unterminated input)
    col++;
    Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
    tokens.push_back({type, s, start_line, start_col});
}
// Lex a Nix indented string (''...''), handling the ''' , ''$ and ''\ escape
// forms and detecting ${...} interpolation. The collected raw text has its
// common leading indentation stripped by strip_indentation() before the
// token (INDENTED_STRING or INDENTED_STRING_INTERP) is emitted.
// NOTE(review): `col` is not advanced past the literal here, so column
// numbers of tokens following an indented string on the same line are off —
// confirm whether anything downstream relies on col after these literals.
void Lexer::tokenize_indented_string() {
    pos += 2; // Skip opening ''
    std::string raw_content;
    bool has_interp = false;
    size_t start_line = line; // token is reported at the line where '' opened
    // Collect raw content until closing ''
    while (pos < input.size()) {
        // Check for escape sequences
        if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
            // Check if it's an escape or the closing delimiter
            if (pos + 2 < input.size() && input[pos + 2] == '\'') {
                // ''' -> escape for ''
                raw_content += "''";
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '$') {
                // ''$ -> escape for $
                raw_content += '$';
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
                // ''\ -> check what follows
                if (pos + 3 < input.size()) {
                    char next = input[pos + 3];
                    if (next == 'n') {
                        raw_content += '\n';
                        pos += 4;
                        continue;
                    } else if (next == 'r') {
                        raw_content += '\r';
                        pos += 4;
                        continue;
                    } else if (next == 't') {
                        raw_content += '\t';
                        pos += 4;
                        continue;
                    } else if (next == ' ' || next == '\t') {
                        // ''\ before whitespace - preserve the whitespace by prepending a marker
                        // We use a special escape sequence that won't appear in normal text;
                        // strip_indentation() recognizes and removes the marker again.
                        raw_content += "\x1F\x1F"; // Unit separator pair as marker for preserved whitespace
                        raw_content += next;
                        pos += 4;
                        continue;
                    }
                }
                // Default: literal backslash
                raw_content += '\\';
                pos += 3;
                continue;
            } else {
                // Just closing ''
                pos += 2;
                break;
            }
        }
        // Check for interpolation
        if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
            has_interp = true;
            raw_content += input[pos];
            pos++;
            // NOTE(review): after pos++ the current char is '{', so this
            // newline check can never fire; looks like dead code — confirm.
            if (input[pos] == '\n') {
                line++;
            }
            continue;
        }
        // Track newlines
        if (input[pos] == '\n') {
            line++;
            raw_content += input[pos];
            pos++;
        } else {
            raw_content += input[pos];
            pos++;
        }
    }
    // Strip common indentation
    std::string stripped = strip_indentation(raw_content);
    Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
    tokens.push_back({type, stripped, start_line, col});
}
// Remove the common leading indentation from every line of an indented
// string: the minimum indent over non-empty lines is stripped from each
// line. The two-byte "\x1F\x1F" marker inserted by tokenize_indented_string()
// for ''\-escaped whitespace halts both the indent measurement and the
// stripping (so escaped whitespace survives); the marker bytes themselves
// are removed from the output.
std::string Lexer::strip_indentation(const std::string& s) {
    if (s.empty())
        return s;
    // Split into lines
    std::vector<std::string> lines;
    std::string current_line;
    for (char c : s) {
        if (c == '\n') {
            lines.push_back(current_line);
            current_line.clear();
        } else {
            current_line += c;
        }
    }
    // Keep a final line when there is trailing text, or an empty final line
    // when the content ends with '\n' (preserves the trailing newline below).
    if (!current_line.empty() || (!s.empty() && s.back() == '\n')) {
        lines.push_back(current_line);
    }
    // Find minimum indentation (spaces/tabs at start of non-empty lines)
    // \x1F\x1F marker indicates preserved whitespace (from ''\ escape)
    size_t min_indent = std::string::npos;
    for (const auto& line : lines) {
        if (line.empty())
            continue; // Skip empty lines when calculating indentation
        size_t indent = 0;
        for (size_t i = 0; i < line.size(); i++) {
            char c = line[i];
            // If we hit the preserved whitespace marker, stop counting indentation
            if (c == '\x1F' && i + 1 < line.size() && line[i + 1] == '\x1F') {
                break;
            }
            if (c == ' ' || c == '\t')
                indent++;
            else
                break;
        }
        if (indent < min_indent)
            min_indent = indent;
    }
    if (min_indent == std::string::npos)
        min_indent = 0; // all lines empty: nothing to strip
    // Strip min_indent from all lines and remove \x1F\x1F markers
    std::string result;
    for (size_t i = 0; i < lines.size(); i++) {
        const auto& line = lines[i];
        if (line.empty()) {
            // Preserve empty lines
            if (i + 1 < lines.size())
                result += '\n';
        } else {
            // Strip indentation, being careful about \x1F\x1F markers
            size_t skip = 0;
            size_t pos = 0;
            while (skip < min_indent && pos < line.size()) {
                if (line[pos] == '\x1F' && pos + 1 < line.size() && line[pos + 1] == '\x1F') {
                    // Hit preserved whitespace marker - don't strip any more
                    break;
                }
                skip++;
                pos++;
            }
            // Add the rest of the line, removing \x1F\x1F markers
            for (size_t j = pos; j < line.size(); j++) {
                if (line[j] == '\x1F' && j + 1 < line.size() && line[j + 1] == '\x1F') {
                    j++; // Skip both marker bytes
                    continue;
                }
                result += line[j];
            }
            if (i + 1 < lines.size())
                result += '\n';
        }
    }
    return result;
}
// Lex a path literal (e.g. ./foo, /etc/nixos): it extends until whitespace
// or a structural delimiter ends it.
void Lexer::tokenize_path() {
    auto terminates = [](char ch) {
        return isspace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' ||
               ch == ']' || ch == ';';
    };
    const size_t begin = pos;
    while (pos < input.size() && !terminates(input[pos]))
        ++pos;
    const std::string path = input.substr(begin, pos - begin);
    tokens.push_back({Token::PATH, path, line, col});
    col += path.size();
}
// Lex a home-relative path (~/something). `pos` sits on the '~'; if no '/'
// follows, only the '~' is consumed and emitted as a PATH token.
void Lexer::tokenize_home_path() {
    auto terminates = [](char ch) {
        return isspace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' ||
               ch == ']' || ch == ';';
    };
    const size_t begin = pos;
    ++pos; // consume '~'
    if (pos < input.size() && input[pos] == '/') {
        // Consume the rest of the path until whitespace or a delimiter.
        while (pos < input.size() && !terminates(input[pos]))
            ++pos;
    }
    const std::string path = input.substr(begin, pos - begin);
    tokens.push_back({Token::PATH, path, line, col});
    col += path.size();
}
// Lex an integer literal. `pos` sits on the first digit or a leading '-'.
void Lexer::tokenize_int() {
    const size_t begin = pos;
    if (input[pos] == '-')
        ++pos; // optional sign
    while (pos < input.size() && isdigit(input[pos]))
        ++pos;
    const std::string text = input.substr(begin, pos - begin);
    tokens.push_back({Token::INT, text, line, col});
    col += text.size();
}
// Lex a float literal of the form [-]digits[.digits]. `pos` sits on the
// first digit or a leading '-'.
void Lexer::tokenize_float() {
    const size_t begin = pos;
    if (input[pos] == '-')
        ++pos; // optional sign
    auto eat_digits = [&] {
        while (pos < input.size() && isdigit(input[pos]))
            ++pos;
    };
    eat_digits(); // integer part
    if (pos < input.size() && input[pos] == '.') {
        ++pos; // decimal point
        eat_digits(); // fractional part (may be empty)
    }
    const std::string text = input.substr(begin, pos - begin);
    tokens.push_back({Token::FLOAT, text, line, col});
    col += text.size();
}
// Lex a bare URI (scheme://...). URIs may contain characters that would
// otherwise lex as operators, so consume greedily until whitespace or a
// clear structural terminator.
void Lexer::tokenize_uri() {
    const size_t begin = pos;
    for (; pos < input.size(); ++pos) {
        const char ch = input[pos];
        if (isspace(ch) || ch == ')' || ch == ']' || ch == ';')
            break;
    }
    const std::string uri = input.substr(begin, pos - begin);
    tokens.push_back({Token::URI, uri, line, col});
    col += uri.size();
}
void Lexer::tokenize_ident() {
size_t start = pos;
// Note: Don't include '.' here - it's used for selection (a.b.c)
// URIs are handled separately by checking for '://' pattern
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
pos++;
std::string ident = input.substr(start, pos - start);
// Check if it's a URI (contains ://)
size_t scheme_end = ident.find("://");
if (scheme_end != std::string::npos && scheme_end > 0) {
tokens.push_back({Token::URI, ident, line, col});
col += ident.size();
return;
}
Token::Type type = Token::IDENT;
if (ident == "let")
type = Token::LET;
else if (ident == "in")
type = Token::IN;
else if (ident == "rec")
type = Token::REC;
else if (ident == "if")
type = Token::IF;
else if (ident == "then")
type = Token::THEN;
else if (ident == "else")
type = Token::ELSE;
else if (ident == "assert")
type = Token::ASSERT;
else if (ident == "with")
type = Token::WITH;
else if (ident == "inherit")
type = Token::INHERIT;
else if (ident == "import")
type = Token::IMPORT;
else if (ident == "true")
type = Token::BOOL;
else if (ident == "false")
type = Token::BOOL;
tokens.push_back({type, ident, line, col});
col += ident.size();
}
} // namespace nix_irc

94
src/irc/lexer.h Normal file
View file

@ -0,0 +1,94 @@
#pragma once
#include <string>
#include <vector>
namespace nix_irc {
// A single lexical token produced by Lexer::tokenize().
// NOTE(review): enumerator values are printed numerically in parser
// diagnostics, so do not reorder this enum casually.
struct Token {
    enum Type {
        // Grouping
        LPAREN,
        RPAREN,
        LBRACE,
        RBRACE,
        LBRACKET,
        RBRACKET,
        // Atoms and literals (the token text is carried in `value`)
        IDENT,
        STRING,
        STRING_INTERP,          // "..." containing ${...}
        INDENTED_STRING,
        INDENTED_STRING_INTERP, // ''...'' containing ${...}
        PATH,
        LOOKUP_PATH,            // <nixpkgs>-style search path (value excludes <>)
        INT,
        FLOAT,
        URI,
        BOOL,                   // value is "true" or "false"
        // Keywords
        LET,
        IN,
        REC,
        IF,
        THEN,
        ELSE,
        ASSERT,
        WITH,
        INHERIT,
        IMPORT,
        // Punctuation
        DOT,
        SEMICOLON,
        COLON,
        EQUALS,
        AT,
        COMMA,
        QUESTION,
        ELLIPSIS, // ...
        // Operators
        PLUS,
        MINUS,
        STAR,
        SLASH,
        CONCAT, // ++
        MERGE,  // //
        EQEQ,
        NE,
        LT,
        GT,
        LE,
        GE,
        AND,  // &&
        OR,   // ||
        IMPL, // ->
        NOT,
        EOF_ // sentinel appended once at the end of the token stream
    } type;
    std::string value; // token text where meaningful; empty for punctuation/operators
    size_t line;       // 1-based line of the token's first character
    size_t col;        // 1-based column of the token's first character
};
// Hand-written lexer for Nix expressions. Scans the whole input eagerly;
// the scanning rules live in lexer.cpp.
class Lexer {
public:
    // `input` is held by reference and must outlive the Lexer.
    Lexer(const std::string& input);
    // Scan the entire input and return the token stream, terminated by a
    // single EOF_ token.
    std::vector<Token> tokenize();
private:
    std::vector<Token> tokens; // accumulated output tokens
    const std::string& input;  // borrowed source text (not owned)
    size_t pos;                // current byte offset into input
    size_t line;               // 1-based line of the next character
    size_t col;                // 1-based column of the next character
    void emit(const Token& t);       // push t and advance one character
    void skip_whitespace();          // skip spaces plus # and /* */ comments
    void tokenize_string();          // "..." literal
    void tokenize_indented_string(); // ''...'' literal
    std::string strip_indentation(const std::string& s); // common-indent removal
    void tokenize_path();      // /x and ./x style paths
    void tokenize_home_path(); // ~/x paths
    void tokenize_int();
    void tokenize_float();
    void tokenize_uri();
    void tokenize_ident(); // identifier or keyword
};
} // namespace nix_irc

View file

@ -1,4 +1,5 @@
#include "parser.h" #include "parser.h"
#include "lexer.h"
#include <array> #include <array>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
@ -59,628 +60,6 @@ static std::pair<std::string, std::string> run_command(const std::string& cmd) {
return {result, ""}; return {result, ""};
} }
struct Token {
enum Type {
LPAREN,
RPAREN,
LBRACE,
RBRACE,
LBRACKET,
RBRACKET,
IDENT,
STRING,
STRING_INTERP,
INDENTED_STRING,
INDENTED_STRING_INTERP,
PATH,
LOOKUP_PATH,
INT,
FLOAT,
URI,
BOOL,
LET,
IN,
REC,
IF,
THEN,
ELSE,
ASSERT,
WITH,
INHERIT,
IMPORT,
DOT,
SEMICOLON,
COLON,
EQUALS,
AT,
COMMA,
QUESTION,
ELLIPSIS,
// Operators
PLUS,
MINUS,
STAR,
SLASH,
CONCAT,
MERGE,
EQEQ,
NE,
LT,
GT,
LE,
GE,
AND,
OR,
IMPL,
NOT,
EOF_
} type;
std::string value;
size_t line;
size_t col;
};
class Lexer {
public:
Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {}
std::vector<Token> tokenize() {
#define TOKEN(t) \
Token { Token::t, "", line, col }
while (pos < input.size()) {
skip_whitespace();
if (pos >= input.size())
break;
char c = input[pos];
if (c == '(') {
emit(TOKEN(LPAREN));
} else if (c == ')') {
emit(TOKEN(RPAREN));
} else if (c == '{') {
emit(TOKEN(LBRACE));
} else if (c == '}') {
emit(TOKEN(RBRACE));
} else if (c == '[') {
emit(TOKEN(LBRACKET));
} else if (c == ']') {
emit(TOKEN(RBRACKET));
} else if (c == ';') {
emit(TOKEN(SEMICOLON));
} else if (c == ':') {
emit(TOKEN(COLON));
} else if (c == '@') {
emit(TOKEN(AT));
} else if (c == ',') {
emit(TOKEN(COMMA));
} else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
tokenize_indented_string();
} else if (c == '"') {
tokenize_string();
}
// Two-char operators
else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(EQEQ));
pos += 2;
col += 2;
} else if (c == '=') {
emit(TOKEN(EQUALS));
} else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(NE));
pos += 2;
col += 2;
} else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(LE));
pos += 2;
col += 2;
} else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(GE));
pos += 2;
col += 2;
} else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
tokens.push_back(TOKEN(CONCAT));
pos += 2;
col += 2;
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
tokens.push_back(TOKEN(MERGE));
pos += 2;
col += 2;
} else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
tokens.push_back(TOKEN(AND));
pos += 2;
col += 2;
} else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
tokens.push_back(TOKEN(OR));
pos += 2;
col += 2;
} else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
tokens.push_back(TOKEN(IMPL));
pos += 2;
col += 2;
}
// Single-char operators
else if (c == '+') {
emit(TOKEN(PLUS));
} else if (c == '*') {
emit(TOKEN(STAR));
} else if (c == '/') {
// Check if it's a path or division
if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
tokenize_path();
} else {
emit(TOKEN(SLASH));
}
} else if (c == '<') {
// Check for lookup path <nixpkgs> vs comparison operator
size_t end = pos + 1;
bool is_lookup_path = false;
// Scan for valid lookup path characters until >
while (end < input.size() &&
(isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
input[end] == '/' || input[end] == '.')) {
end++;
}
// If we found > and there's content, it's a lookup path
if (end < input.size() && input[end] == '>' && end > pos + 1) {
std::string path = input.substr(pos + 1, end - pos - 1);
tokens.push_back({Token::LOOKUP_PATH, path, line, col});
pos = end + 1;
col += (end - pos + 1);
is_lookup_path = true;
}
if (!is_lookup_path) {
emit(TOKEN(LT));
}
} else if (c == '>') {
emit(TOKEN(GT));
} else if (c == '!') {
emit(TOKEN(NOT));
} else if (c == '.') {
// Check for ellipsis (...)
if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
tokens.push_back(TOKEN(ELLIPSIS));
pos += 3;
col += 3;
} else {
emit(TOKEN(DOT));
}
} else if (c == '?') {
emit(TOKEN(QUESTION));
} else if (c == '~') {
// Home-relative path ~/...
if (pos + 1 < input.size() && input[pos + 1] == '/') {
tokenize_home_path();
} else {
// Just ~ by itself is an identifier
tokenize_ident();
}
} else if (c == '-') {
// Check if it's a negative number or minus operator
if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
// Check for negative float
if (pos + 2 < input.size() && input[pos + 2] == '.') {
tokenize_float();
} else {
tokenize_int();
}
} else {
emit(TOKEN(MINUS));
}
} else if (isdigit(c)) {
// Check if it's a float (digit followed by '.')
if (pos + 1 < input.size() && input[pos + 1] == '.') {
tokenize_float();
} else {
tokenize_int();
}
} else if (isalpha(c)) {
// Check if it's a URI (contains ://) - look ahead
size_t lookahead = pos;
while (lookahead < input.size() &&
(isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
input[lookahead] == '+' || input[lookahead] == '.'))
lookahead++;
std::string potential_scheme = input.substr(pos, lookahead - pos);
if (lookahead + 2 < input.size() && input[lookahead] == ':' &&
input[lookahead + 1] == '/' && input[lookahead + 2] == '/') {
// It's a URI, consume the whole thing
tokenize_uri();
} else {
tokenize_ident();
}
} else {
pos++;
col++;
}
}
tokens.push_back({Token::EOF_, "", line, col});
#undef TOKEN
return tokens;
}
private:
std::vector<Token> tokens;
const std::string& input;
size_t pos;
size_t line;
size_t col;
void emit(const Token& t) {
tokens.push_back(t);
pos++;
col++;
}
void skip_whitespace() {
while (pos < input.size()) {
char c = input[pos];
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
if (c == '\n') {
line++;
col = 1;
} else {
col++;
}
pos++;
} else if (c == '#') {
// Line comment - skip until newline
while (pos < input.size() && input[pos] != '\n')
pos++;
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
// Block comment /* ... */
// Note: Nix block comments do NOT nest
pos += 2; // Skip /*
while (pos + 1 < input.size()) {
if (input[pos] == '*' && input[pos + 1] == '/') {
pos += 2; // Skip */
break;
}
if (input[pos] == '\n') {
line++;
col = 1;
} else {
col++;
}
pos++;
}
} else {
break;
}
}
}
void tokenize_string() {
pos++;
std::string s;
bool has_interp = false;
while (pos < input.size() && input[pos] != '"') {
if (input[pos] == '\\' && pos + 1 < input.size()) {
pos++;
switch (input[pos]) {
case 'n':
s += '\n';
break;
case 't':
s += '\t';
break;
case 'r':
s += '\r';
break;
case '"':
s += '"';
break;
case '\\':
s += '\\';
break;
case '$':
s += '$';
break; // Escaped $
default:
s += input[pos];
break;
}
pos++;
} else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
// Found interpolation marker
has_interp = true;
s += input[pos]; // Keep $ in raw string
pos++;
} else {
s += input[pos];
pos++;
}
}
pos++;
Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
tokens.push_back({type, s, line, col});
col += s.size() + 2;
}
void tokenize_indented_string() {
pos += 2; // Skip opening ''
std::string raw_content;
bool has_interp = false;
size_t start_line = line;
// Collect raw content until closing ''
while (pos < input.size()) {
// Check for escape sequences
if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
// Check if it's an escape or the closing delimiter
if (pos + 2 < input.size() && input[pos + 2] == '\'') {
// ''' -> escape for ''
raw_content += "''";
pos += 3;
continue;
} else if (pos + 2 < input.size() && input[pos + 2] == '$') {
// ''$ -> escape for $
raw_content += '$';
pos += 3;
continue;
} else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
// ''\ -> check what follows
if (pos + 3 < input.size()) {
char next = input[pos + 3];
if (next == 'n') {
raw_content += '\n';
pos += 4;
continue;
} else if (next == 'r') {
raw_content += '\r';
pos += 4;
continue;
} else if (next == 't') {
raw_content += '\t';
pos += 4;
continue;
} else if (next == ' ' || next == '\t') {
// ''\ before whitespace - preserve the whitespace (mark it specially)
raw_content += "\x01"; // Use control char as marker for preserved whitespace
raw_content += next;
pos += 4;
continue;
}
}
// Default: literal backslash
raw_content += '\\';
pos += 3;
continue;
} else {
// Just closing ''
pos += 2;
break;
}
}
// Check for interpolation
if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
has_interp = true;
raw_content += input[pos];
pos++;
if (input[pos] == '\n') {
line++;
}
continue;
}
// Track newlines
if (input[pos] == '\n') {
line++;
raw_content += input[pos];
pos++;
} else {
raw_content += input[pos];
pos++;
}
}
// Strip common indentation
std::string stripped = strip_indentation(raw_content);
Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
tokens.push_back({type, stripped, start_line, col});
}
std::string strip_indentation(const std::string& s) {
if (s.empty())
return s;
// Split into lines
std::vector<std::string> lines;
std::string current_line;
for (char c : s) {
if (c == '\n') {
lines.push_back(current_line);
current_line.clear();
} else {
current_line += c;
}
}
if (!current_line.empty() || (!s.empty() && s.back() == '\n')) {
lines.push_back(current_line);
}
// Find minimum indentation (spaces/tabs at start of non-empty lines)
// \x01 marker indicates preserved whitespace (from ''\ escape)
size_t min_indent = std::string::npos;
for (const auto& line : lines) {
if (line.empty())
continue; // Skip empty lines when calculating indentation
size_t indent = 0;
for (size_t i = 0; i < line.size(); i++) {
char c = line[i];
// If we hit the preserved whitespace marker, stop counting indentation
if (c == '\x01')
break;
if (c == ' ' || c == '\t')
indent++;
else
break;
}
if (indent < min_indent)
min_indent = indent;
}
if (min_indent == std::string::npos)
min_indent = 0;
// Strip min_indent from all lines and remove \x01 markers
std::string result;
for (size_t i = 0; i < lines.size(); i++) {
const auto& line = lines[i];
if (line.empty()) {
// Preserve empty lines
if (i + 1 < lines.size())
result += '\n';
} else {
// Strip indentation, being careful about \x01 markers
size_t skip = 0;
size_t pos = 0;
while (skip < min_indent && pos < line.size()) {
if (line[pos] == '\x01') {
// Hit preserved whitespace marker - don't strip any more
break;
}
skip++;
pos++;
}
// Add the rest of the line, removing \x01 markers
for (size_t j = pos; j < line.size(); j++) {
if (line[j] != '\x01') {
result += line[j];
}
}
if (i + 1 < lines.size())
result += '\n';
}
}
return result;
}
void tokenize_path() {
size_t start = pos;
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
std::string path = input.substr(start, pos - start);
tokens.push_back({Token::PATH, path, line, col});
col += path.size();
}
void tokenize_home_path() {
size_t start = pos;
pos++; // Skip ~
if (pos < input.size() && input[pos] == '/') {
// Home-relative path ~/something
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
}
std::string path = input.substr(start, pos - start);
tokens.push_back({Token::PATH, path, line, col});
col += path.size();
}
void tokenize_int() {
size_t start = pos;
if (input[pos] == '-')
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
std::string num = input.substr(start, pos - start);
tokens.push_back({Token::INT, num, line, col});
col += num.size();
}
void tokenize_float() {
size_t start = pos;
if (input[pos] == '-')
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
if (pos < input.size() && input[pos] == '.') {
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
}
std::string num = input.substr(start, pos - start);
tokens.push_back({Token::FLOAT, num, line, col});
col += num.size();
}
void tokenize_uri() {
size_t start = pos;
while (pos < input.size() && !isspace(input[pos]) && input[pos] != ')' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
std::string uri = input.substr(start, pos - start);
tokens.push_back({Token::URI, uri, line, col});
col += uri.size();
}
void tokenize_ident() {
size_t start = pos;
// Note: Don't include '.' here - it's used for selection (a.b.c)
// URIs are handled separately by checking for '://' pattern
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
pos++;
std::string ident = input.substr(start, pos - start);
// Check if it's a URI (contains ://)
size_t scheme_end = ident.find("://");
if (scheme_end != std::string::npos && scheme_end > 0) {
tokens.push_back({Token::URI, ident, line, col});
col += ident.size();
return;
}
Token::Type type = Token::IDENT;
if (ident == "let")
type = Token::LET;
else if (ident == "in")
type = Token::IN;
else if (ident == "rec")
type = Token::REC;
else if (ident == "if")
type = Token::IF;
else if (ident == "then")
type = Token::THEN;
else if (ident == "else")
type = Token::ELSE;
else if (ident == "assert")
type = Token::ASSERT;
else if (ident == "with")
type = Token::WITH;
else if (ident == "inherit")
type = Token::INHERIT;
else if (ident == "import")
type = Token::IMPORT;
else if (ident == "true")
type = Token::BOOL;
else if (ident == "false")
type = Token::BOOL;
tokens.push_back({type, ident, line, col});
col += ident.size();
}
};
class Parser::Impl { class Parser::Impl {
public: public:
std::vector<Token> tokens; std::vector<Token> tokens;
@ -706,9 +85,9 @@ public:
bool expect(Token::Type type) { bool expect(Token::Type type) {
if (current().type != type) { if (current().type != type) {
std::cerr << "Expected token " << type << " but got " << current().type << " at " throw std::runtime_error(
<< current().line << ":" << current().col << "\n"; "Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) +
return false; " at " + std::to_string(current().line) + ":" + std::to_string(current().col));
} }
advance(); advance();
return true; return true;
@ -718,7 +97,7 @@ public:
int get_precedence(Token::Type type) { int get_precedence(Token::Type type) {
switch (type) { switch (type) {
case Token::MERGE: case Token::MERGE:
return 1; // Low precedence - binds loosely, but must be > 0 to be recognized as binary op return 1; // low precedence - binds loosely, but must be > 0 to be recognized as binary op
case Token::OR: case Token::OR:
return 1; return 1;
case Token::AND: case Token::AND:
@ -942,9 +321,10 @@ public:
} else if (current().type == Token::IDENT || current().type == Token::INT || } else if (current().type == Token::IDENT || current().type == Token::INT ||
current().type == Token::FLOAT || current().type == Token::BOOL || current().type == Token::FLOAT || current().type == Token::BOOL ||
current().type == Token::PATH || current().type == Token::LOOKUP_PATH || current().type == Token::PATH || current().type == Token::LOOKUP_PATH ||
current().type == Token::URI || current().type == Token::LBRACKET) { current().type == Token::URI || current().type == Token::LBRACKET ||
current().type == Token::LBRACE) {
// Juxtaposition application: f x // Juxtaposition application: f x
// Parse the argument as a primary expression (which handles lists, etc.) // Parse the argument as a primary expression (which handles lists, attrsets, etc.)
auto arg = parse_expr3(); auto arg = parse_expr3();
left = std::make_shared<Node>(AppNode(left, arg)); left = std::make_shared<Node>(AppNode(left, arg));
} else { } else {
@ -1056,9 +436,8 @@ public:
return std::make_shared<Node>(ConstBoolNode(t.value == "true")); return std::make_shared<Node>(ConstBoolNode(t.value == "true"));
} }
std::cerr << "Unknown token: " << t.value << " (type " << t.type << ")\n"; throw std::runtime_error("Unknown token: " + t.value + " (type " + std::to_string(t.type) +
advance(); ")");
return std::make_shared<Node>(ConstNullNode());
} }
std::shared_ptr<Node> parse_attrs() { std::shared_ptr<Node> parse_attrs() {