irc: split parser into lexer and parser components
Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: I4e73459a02caff5335d690656fd6f1396a6a6964
This commit is contained in:
parent
feb247f64a
commit
0a5920adaf
3 changed files with 666 additions and 631 deletions
562
src/irc/lexer.cpp
Normal file
562
src/irc/lexer.cpp
Normal file
|
|
@ -0,0 +1,562 @@
|
|||
#include "lexer.h"
|
||||
#include <cctype>
|
||||
|
||||
namespace nix_irc {
|
||||
|
||||
// Stores a reference to `input` (no copy; caller must keep it alive) and
// starts scanning at offset 0, position line 1, column 1.
Lexer::Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {}
|
||||
|
||||
std::vector<Token> Lexer::tokenize() {
|
||||
#define TOKEN(t) \
|
||||
Token { \
|
||||
Token::t, "", line, col \
|
||||
}
|
||||
|
||||
while (pos < input.size()) {
|
||||
skip_whitespace();
|
||||
if (pos >= input.size())
|
||||
break;
|
||||
|
||||
char c = input[pos];
|
||||
|
||||
if (c == '(') {
|
||||
emit(TOKEN(LPAREN));
|
||||
} else if (c == ')') {
|
||||
emit(TOKEN(RPAREN));
|
||||
} else if (c == '{') {
|
||||
emit(TOKEN(LBRACE));
|
||||
} else if (c == '}') {
|
||||
emit(TOKEN(RBRACE));
|
||||
} else if (c == '[') {
|
||||
emit(TOKEN(LBRACKET));
|
||||
} else if (c == ']') {
|
||||
emit(TOKEN(RBRACKET));
|
||||
} else if (c == ';') {
|
||||
emit(TOKEN(SEMICOLON));
|
||||
} else if (c == ':') {
|
||||
emit(TOKEN(COLON));
|
||||
} else if (c == '@') {
|
||||
emit(TOKEN(AT));
|
||||
} else if (c == ',') {
|
||||
emit(TOKEN(COMMA));
|
||||
} else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
|
||||
tokenize_indented_string();
|
||||
} else if (c == '"') {
|
||||
tokenize_string();
|
||||
}
|
||||
// Two-char operators
|
||||
else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(EQEQ));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '=') {
|
||||
emit(TOKEN(EQUALS));
|
||||
} else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(NE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(LE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(GE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
|
||||
tokens.push_back(TOKEN(CONCAT));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
|
||||
tokens.push_back(TOKEN(MERGE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
|
||||
tokens.push_back(TOKEN(AND));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
|
||||
tokens.push_back(TOKEN(OR));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
|
||||
tokens.push_back(TOKEN(IMPL));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
}
|
||||
// Single-char operators
|
||||
else if (c == '+') {
|
||||
emit(TOKEN(PLUS));
|
||||
} else if (c == '*') {
|
||||
emit(TOKEN(STAR));
|
||||
} else if (c == '/') {
|
||||
// Check if it's a path or division
|
||||
if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
|
||||
tokenize_path();
|
||||
} else {
|
||||
emit(TOKEN(SLASH));
|
||||
}
|
||||
} else if (c == '<') {
|
||||
// Check for lookup path <nixpkgs> vs comparison operator
|
||||
size_t end = pos + 1;
|
||||
bool is_lookup_path = false;
|
||||
|
||||
// Scan for valid lookup path characters until >
|
||||
while (end < input.size() && (isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
|
||||
input[end] == '/' || input[end] == '.')) {
|
||||
end++;
|
||||
}
|
||||
|
||||
// If we found > and there's content, it's a lookup path
|
||||
if (end < input.size() && input[end] == '>' && end > pos + 1) {
|
||||
std::string path = input.substr(pos + 1, end - pos - 1);
|
||||
tokens.push_back({Token::LOOKUP_PATH, path, line, col});
|
||||
pos = end + 1;
|
||||
col += (end - pos + 1);
|
||||
is_lookup_path = true;
|
||||
}
|
||||
|
||||
if (!is_lookup_path) {
|
||||
emit(TOKEN(LT));
|
||||
}
|
||||
} else if (c == '>') {
|
||||
emit(TOKEN(GT));
|
||||
} else if (c == '!') {
|
||||
emit(TOKEN(NOT));
|
||||
} else if (c == '.') {
|
||||
// Check for ellipsis (...)
|
||||
if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
|
||||
tokens.push_back(TOKEN(ELLIPSIS));
|
||||
pos += 3;
|
||||
col += 3;
|
||||
} else {
|
||||
emit(TOKEN(DOT));
|
||||
}
|
||||
} else if (c == '?') {
|
||||
emit(TOKEN(QUESTION));
|
||||
} else if (c == '~') {
|
||||
// Home-relative path ~/...
|
||||
if (pos + 1 < input.size() && input[pos + 1] == '/') {
|
||||
tokenize_home_path();
|
||||
} else {
|
||||
// Just ~ by itself is an identifier
|
||||
tokenize_ident();
|
||||
}
|
||||
} else if (c == '-') {
|
||||
// Check if it's a negative number or minus operator
|
||||
if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
|
||||
// Check for negative float
|
||||
if (pos + 2 < input.size() && input[pos + 2] == '.') {
|
||||
tokenize_float();
|
||||
} else {
|
||||
tokenize_int();
|
||||
}
|
||||
} else {
|
||||
emit(TOKEN(MINUS));
|
||||
}
|
||||
} else if (isdigit(c)) {
|
||||
// Check if it's a float (digit followed by '.')
|
||||
if (pos + 1 < input.size() && input[pos + 1] == '.') {
|
||||
tokenize_float();
|
||||
} else {
|
||||
tokenize_int();
|
||||
}
|
||||
} else if (isalpha(c)) {
|
||||
// Check if it's a URI (contains ://) - look ahead
|
||||
size_t lookahead = pos;
|
||||
while (lookahead < input.size() &&
|
||||
(isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
|
||||
input[lookahead] == '+' || input[lookahead] == '.'))
|
||||
lookahead++;
|
||||
std::string potential_scheme = input.substr(pos, lookahead - pos);
|
||||
if (lookahead + 2 < input.size() && input[lookahead] == ':' && input[lookahead + 1] == '/' &&
|
||||
input[lookahead + 2] == '/') {
|
||||
// It's a URI, consume the whole thing
|
||||
tokenize_uri();
|
||||
} else {
|
||||
tokenize_ident();
|
||||
}
|
||||
} else {
|
||||
pos++;
|
||||
col++;
|
||||
}
|
||||
}
|
||||
tokens.push_back({Token::EOF_, "", line, col});
|
||||
|
||||
#undef TOKEN
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// Records a pre-built single-character token, then consumes that character
// by stepping both the input offset and the column.
void Lexer::emit(const Token& t) {
    tokens.push_back(t);
    ++pos;
    ++col;
}
|
||||
|
||||
// Advances past spaces, tabs, CR/LF, '#' line comments, and /* ... */ block
// comments, updating line/col for every newline crossed. Stops at the first
// byte that can start a token (or at end of input).
void Lexer::skip_whitespace() {
    while (pos < input.size()) {
        char c = input[pos];
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            if (c == '\n') {
                line++;
                col = 1;
            } else {
                col++;
            }
            pos++;
        } else if (c == '#') {
            // Line comment - skip until newline
            // col is intentionally not advanced here: the terminating '\n'
            // is handled by the branch above, which resets col to 1.
            while (pos < input.size() && input[pos] != '\n')
                pos++;
        } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
            // Block comment /* ... */
            // Note: Nix block comments do NOT nest
            pos += 2; // Skip /*
            // An unterminated comment simply runs to end of input (the loop
            // condition needs two bytes left to match the closing "*/").
            while (pos + 1 < input.size()) {
                if (input[pos] == '*' && input[pos + 1] == '/') {
                    pos += 2; // Skip */
                    break;
                }
                if (input[pos] == '\n') {
                    line++;
                    col = 1;
                } else {
                    col++;
                }
                pos++;
            }
        } else {
            break;
        }
    }
}
|
||||
|
||||
void Lexer::tokenize_string() {
|
||||
pos++;
|
||||
std::string s;
|
||||
bool has_interp = false;
|
||||
|
||||
while (pos < input.size() && input[pos] != '"') {
|
||||
if (input[pos] == '\\' && pos + 1 < input.size()) {
|
||||
pos++;
|
||||
switch (input[pos]) {
|
||||
case 'n':
|
||||
s += '\n';
|
||||
break;
|
||||
case 't':
|
||||
s += '\t';
|
||||
break;
|
||||
case 'r':
|
||||
s += '\r';
|
||||
break;
|
||||
case '"':
|
||||
s += '"';
|
||||
break;
|
||||
case '\\':
|
||||
s += '\\';
|
||||
break;
|
||||
case '$':
|
||||
s += '$';
|
||||
break; // Escaped $
|
||||
default:
|
||||
s += input[pos];
|
||||
break;
|
||||
}
|
||||
pos++;
|
||||
} else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
|
||||
// Found interpolation marker
|
||||
has_interp = true;
|
||||
s += input[pos]; // Keep $ in raw string
|
||||
pos++;
|
||||
} else {
|
||||
s += input[pos];
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
pos++;
|
||||
|
||||
Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
|
||||
tokens.push_back({type, s, line, col});
|
||||
col += s.size() + 2;
|
||||
}
|
||||
|
||||
// Lexes a Nix indented string ''...'' starting at the opening ''.
// Escapes handled here: '''  -> literal ''
//                       ''$  -> literal $ (suppresses interpolation)
//                       ''\n / ''\r / ''\t -> control character
//                       ''\<space-or-tab>  -> whitespace preserved from
//                         indentation stripping, marked with the \x1F\x1F
//                         sentinel consumed later by strip_indentation().
// A raw "${" flags the token as INDENTED_STRING_INTERP. The token reports
// the line the string started on; `col` is not advanced by this function.
void Lexer::tokenize_indented_string() {
    pos += 2; // Skip opening ''
    std::string raw_content;
    bool has_interp = false;
    size_t start_line = line;

    // Collect raw content until closing ''
    while (pos < input.size()) {
        // Check for escape sequences
        if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
            // Check if it's an escape or the closing delimiter
            if (pos + 2 < input.size() && input[pos + 2] == '\'') {
                // ''' -> escape for ''
                raw_content += "''";
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '$') {
                // ''$ -> escape for $
                raw_content += '$';
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
                // ''\ -> check what follows
                if (pos + 3 < input.size()) {
                    char next = input[pos + 3];
                    if (next == 'n') {
                        raw_content += '\n';
                        pos += 4;
                        continue;
                    } else if (next == 'r') {
                        raw_content += '\r';
                        pos += 4;
                        continue;
                    } else if (next == 't') {
                        raw_content += '\t';
                        pos += 4;
                        continue;
                    } else if (next == ' ' || next == '\t') {
                        // ''\ before whitespace - preserve the whitespace by prepending a marker
                        // We use a special escape sequence that won't appear in normal text
                        raw_content += "\x1F\x1F"; // Unit separator pair as marker for preserved whitespace
                        raw_content += next;
                        pos += 4;
                        continue;
                    }
                }
                // Default: literal backslash
                raw_content += '\\';
                pos += 3;
                continue;
            } else {
                // Just closing ''
                pos += 2;
                break;
            }
        }

        // Check for interpolation
        if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
            has_interp = true;
            raw_content += input[pos]; // Keep $ so the parser can find ${...}
            pos++;
            // NOTE(review): input[pos] is now the '{' verified above, so this
            // newline check can never fire -- it looks like dead code.
            if (input[pos] == '\n') {
                line++;
            }
            continue;
        }

        // Track newlines
        if (input[pos] == '\n') {
            line++;
            raw_content += input[pos];
            pos++;
        } else {
            raw_content += input[pos];
            pos++;
        }
    }

    // Strip common indentation
    std::string stripped = strip_indentation(raw_content);

    Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
    tokens.push_back({type, stripped, start_line, col});
}
|
||||
|
||||
// Implements the indented-string ('' '') stripping rule: removes the minimum
// common leading whitespace (spaces/tabs) shared by all non-empty lines, and
// deletes the \x1F\x1F sentinels that tokenize_indented_string() inserted to
// protect whitespace escaped with ''\ from being stripped.
std::string Lexer::strip_indentation(const std::string& s) {
    if (s.empty())
        return s;

    // Split into lines
    std::vector<std::string> lines;
    std::string current_line;
    for (char c : s) {
        if (c == '\n') {
            lines.push_back(current_line);
            current_line.clear();
        } else {
            current_line += c;
        }
    }
    // A trailing fragment -- or a final '\n' -- still contributes one line.
    if (!current_line.empty() || (!s.empty() && s.back() == '\n')) {
        lines.push_back(current_line);
    }

    // Find minimum indentation (spaces/tabs at start of non-empty lines)
    // \x1F\x1F marker indicates preserved whitespace (from ''\ escape)
    size_t min_indent = std::string::npos;
    for (const auto& line : lines) {
        if (line.empty())
            continue; // Skip empty lines when calculating indentation
        // NOTE(review): lines containing only spaces/tabs are counted here
        // and can shrink min_indent; Nix itself ignores whitespace-only
        // lines when computing the common indentation -- confirm intent.
        size_t indent = 0;
        for (size_t i = 0; i < line.size(); i++) {
            char c = line[i];
            // If we hit the preserved whitespace marker, stop counting indentation
            if (c == '\x1F' && i + 1 < line.size() && line[i + 1] == '\x1F') {
                break;
            }
            if (c == ' ' || c == '\t')
                indent++;
            else
                break;
        }
        if (indent < min_indent)
            min_indent = indent;
    }

    // Every line was empty: nothing to strip.
    if (min_indent == std::string::npos)
        min_indent = 0;

    // Strip min_indent from all lines and remove \x1F\x1F markers
    std::string result;
    for (size_t i = 0; i < lines.size(); i++) {
        const auto& line = lines[i];
        if (line.empty()) {
            // Preserve empty lines
            if (i + 1 < lines.size())
                result += '\n';
        } else {
            // Strip indentation, being careful about \x1F\x1F markers
            size_t skip = 0;
            size_t pos = 0;
            while (skip < min_indent && pos < line.size()) {
                if (line[pos] == '\x1F' && pos + 1 < line.size() && line[pos + 1] == '\x1F') {
                    // Hit preserved whitespace marker - don't strip any more
                    break;
                }
                skip++;
                pos++;
            }

            // Add the rest of the line, removing \x1F\x1F markers
            for (size_t j = pos; j < line.size(); j++) {
                if (line[j] == '\x1F' && j + 1 < line.size() && line[j + 1] == '\x1F') {
                    j++; // Skip both marker bytes
                    continue;
                }
                result += line[j];
            }

            if (i + 1 < lines.size())
                result += '\n';
        }
    }

    return result;
}
|
||||
|
||||
void Lexer::tokenize_path() {
|
||||
size_t start = pos;
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
|
||||
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
std::string path = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::PATH, path, line, col});
|
||||
col += path.size();
|
||||
}
|
||||
|
||||
void Lexer::tokenize_home_path() {
|
||||
size_t start = pos;
|
||||
pos++; // Skip ~
|
||||
if (pos < input.size() && input[pos] == '/') {
|
||||
// Home-relative path ~/something
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
|
||||
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
std::string path = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::PATH, path, line, col});
|
||||
col += path.size();
|
||||
}
|
||||
|
||||
void Lexer::tokenize_int() {
|
||||
size_t start = pos;
|
||||
if (input[pos] == '-')
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
std::string num = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::INT, num, line, col});
|
||||
col += num.size();
|
||||
}
|
||||
|
||||
void Lexer::tokenize_float() {
|
||||
size_t start = pos;
|
||||
if (input[pos] == '-')
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
if (pos < input.size() && input[pos] == '.') {
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
}
|
||||
std::string num = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::FLOAT, num, line, col});
|
||||
col += num.size();
|
||||
}
|
||||
|
||||
void Lexer::tokenize_uri() {
|
||||
size_t start = pos;
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != ')' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
std::string uri = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::URI, uri, line, col});
|
||||
col += uri.size();
|
||||
}
|
||||
|
||||
void Lexer::tokenize_ident() {
|
||||
size_t start = pos;
|
||||
// Note: Don't include '.' here - it's used for selection (a.b.c)
|
||||
// URIs are handled separately by checking for '://' pattern
|
||||
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
|
||||
pos++;
|
||||
std::string ident = input.substr(start, pos - start);
|
||||
|
||||
// Check if it's a URI (contains ://)
|
||||
size_t scheme_end = ident.find("://");
|
||||
if (scheme_end != std::string::npos && scheme_end > 0) {
|
||||
tokens.push_back({Token::URI, ident, line, col});
|
||||
col += ident.size();
|
||||
return;
|
||||
}
|
||||
|
||||
Token::Type type = Token::IDENT;
|
||||
if (ident == "let")
|
||||
type = Token::LET;
|
||||
else if (ident == "in")
|
||||
type = Token::IN;
|
||||
else if (ident == "rec")
|
||||
type = Token::REC;
|
||||
else if (ident == "if")
|
||||
type = Token::IF;
|
||||
else if (ident == "then")
|
||||
type = Token::THEN;
|
||||
else if (ident == "else")
|
||||
type = Token::ELSE;
|
||||
else if (ident == "assert")
|
||||
type = Token::ASSERT;
|
||||
else if (ident == "with")
|
||||
type = Token::WITH;
|
||||
else if (ident == "inherit")
|
||||
type = Token::INHERIT;
|
||||
else if (ident == "import")
|
||||
type = Token::IMPORT;
|
||||
else if (ident == "true")
|
||||
type = Token::BOOL;
|
||||
else if (ident == "false")
|
||||
type = Token::BOOL;
|
||||
|
||||
tokens.push_back({type, ident, line, col});
|
||||
col += ident.size();
|
||||
}
|
||||
|
||||
} // namespace nix_irc
|
||||
94
src/irc/lexer.h
Normal file
94
src/irc/lexer.h
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
#pragma once

#include <string>
#include <vector>

namespace nix_irc {

// One lexical unit of Nix source, tagged with the 1-based position where it
// starts.
struct Token {
    enum Type {
        LPAREN,
        RPAREN,
        LBRACE,
        RBRACE,
        LBRACKET,
        RBRACKET,
        IDENT,
        STRING,                 // "..." without interpolation
        STRING_INTERP,          // "..." containing ${...}
        INDENTED_STRING,        // ''...'' without interpolation
        INDENTED_STRING_INTERP, // ''...'' containing ${...}
        PATH,                   // /foo/bar or ~/foo
        LOOKUP_PATH,            // <nixpkgs> -- value holds the inner text
        INT,
        FLOAT,
        URI,
        BOOL,                   // "true" or "false" (spelling kept in value)
        // Keywords
        LET,
        IN,
        REC,
        IF,
        THEN,
        ELSE,
        ASSERT,
        WITH,
        INHERIT,
        IMPORT,
        // Punctuation
        DOT,
        SEMICOLON,
        COLON,
        EQUALS,
        AT,
        COMMA,
        QUESTION,
        ELLIPSIS,
        // Operators
        PLUS,
        MINUS,
        STAR,
        SLASH,
        CONCAT, // ++
        MERGE,  // //
        EQEQ,
        NE,
        LT,
        GT,
        LE,
        GE,
        AND,  // &&
        OR,   // ||
        IMPL, // ->
        NOT,
        EOF_  // Always the final token produced by Lexer::tokenize()
    } type;
    std::string value; // Literal text / identifier spelling ("" for pure operators)
    size_t line;       // 1-based line where the token starts
    size_t col;        // 1-based column where the token starts
};

// Converts Nix source text into a flat Token stream for the parser.
class Lexer {
public:
    // Borrows `input` by reference; the string must outlive the Lexer.
    // NOTE(review): consider marking this constructor `explicit`.
    Lexer(const std::string& input);
    // Scans the whole input; the returned stream always ends with EOF_.
    std::vector<Token> tokenize();

private:
    std::vector<Token> tokens;  // Accumulated output
    const std::string& input;   // Borrowed source text
    size_t pos;                 // Current byte offset into input
    size_t line;                // Current 1-based line
    size_t col;                 // Current 1-based column

    // Push token `t`, then consume one character.
    void emit(const Token& t);
    // Skip whitespace and both comment forms, tracking line/col.
    void skip_whitespace();
    void tokenize_string();
    void tokenize_indented_string();
    // Remove common leading whitespace from an indented-string body.
    std::string strip_indentation(const std::string& s);
    void tokenize_path();
    void tokenize_home_path();
    void tokenize_int();
    void tokenize_float();
    void tokenize_uri();
    void tokenize_ident();
};

} // namespace nix_irc
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
#include "parser.h"
|
||||
#include "lexer.h"
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
|
@ -59,628 +60,6 @@ static std::pair<std::string, std::string> run_command(const std::string& cmd) {
|
|||
return {result, ""};
|
||||
}
|
||||
|
||||
struct Token {
|
||||
enum Type {
|
||||
LPAREN,
|
||||
RPAREN,
|
||||
LBRACE,
|
||||
RBRACE,
|
||||
LBRACKET,
|
||||
RBRACKET,
|
||||
IDENT,
|
||||
STRING,
|
||||
STRING_INTERP,
|
||||
INDENTED_STRING,
|
||||
INDENTED_STRING_INTERP,
|
||||
PATH,
|
||||
LOOKUP_PATH,
|
||||
INT,
|
||||
FLOAT,
|
||||
URI,
|
||||
BOOL,
|
||||
LET,
|
||||
IN,
|
||||
REC,
|
||||
IF,
|
||||
THEN,
|
||||
ELSE,
|
||||
ASSERT,
|
||||
WITH,
|
||||
INHERIT,
|
||||
IMPORT,
|
||||
DOT,
|
||||
SEMICOLON,
|
||||
COLON,
|
||||
EQUALS,
|
||||
AT,
|
||||
COMMA,
|
||||
QUESTION,
|
||||
ELLIPSIS,
|
||||
// Operators
|
||||
PLUS,
|
||||
MINUS,
|
||||
STAR,
|
||||
SLASH,
|
||||
CONCAT,
|
||||
MERGE,
|
||||
EQEQ,
|
||||
NE,
|
||||
LT,
|
||||
GT,
|
||||
LE,
|
||||
GE,
|
||||
AND,
|
||||
OR,
|
||||
IMPL,
|
||||
NOT,
|
||||
EOF_
|
||||
} type;
|
||||
std::string value;
|
||||
size_t line;
|
||||
size_t col;
|
||||
};
|
||||
|
||||
class Lexer {
|
||||
public:
|
||||
Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {}
|
||||
|
||||
std::vector<Token> tokenize() {
|
||||
#define TOKEN(t) \
|
||||
Token { Token::t, "", line, col }
|
||||
|
||||
while (pos < input.size()) {
|
||||
skip_whitespace();
|
||||
if (pos >= input.size())
|
||||
break;
|
||||
|
||||
char c = input[pos];
|
||||
|
||||
if (c == '(') {
|
||||
emit(TOKEN(LPAREN));
|
||||
} else if (c == ')') {
|
||||
emit(TOKEN(RPAREN));
|
||||
} else if (c == '{') {
|
||||
emit(TOKEN(LBRACE));
|
||||
} else if (c == '}') {
|
||||
emit(TOKEN(RBRACE));
|
||||
} else if (c == '[') {
|
||||
emit(TOKEN(LBRACKET));
|
||||
} else if (c == ']') {
|
||||
emit(TOKEN(RBRACKET));
|
||||
} else if (c == ';') {
|
||||
emit(TOKEN(SEMICOLON));
|
||||
} else if (c == ':') {
|
||||
emit(TOKEN(COLON));
|
||||
} else if (c == '@') {
|
||||
emit(TOKEN(AT));
|
||||
} else if (c == ',') {
|
||||
emit(TOKEN(COMMA));
|
||||
} else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
|
||||
tokenize_indented_string();
|
||||
} else if (c == '"') {
|
||||
tokenize_string();
|
||||
}
|
||||
// Two-char operators
|
||||
else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(EQEQ));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '=') {
|
||||
emit(TOKEN(EQUALS));
|
||||
} else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(NE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(LE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
|
||||
tokens.push_back(TOKEN(GE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
|
||||
tokens.push_back(TOKEN(CONCAT));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
|
||||
tokens.push_back(TOKEN(MERGE));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
|
||||
tokens.push_back(TOKEN(AND));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
|
||||
tokens.push_back(TOKEN(OR));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
} else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
|
||||
tokens.push_back(TOKEN(IMPL));
|
||||
pos += 2;
|
||||
col += 2;
|
||||
}
|
||||
// Single-char operators
|
||||
else if (c == '+') {
|
||||
emit(TOKEN(PLUS));
|
||||
} else if (c == '*') {
|
||||
emit(TOKEN(STAR));
|
||||
} else if (c == '/') {
|
||||
// Check if it's a path or division
|
||||
if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
|
||||
tokenize_path();
|
||||
} else {
|
||||
emit(TOKEN(SLASH));
|
||||
}
|
||||
} else if (c == '<') {
|
||||
// Check for lookup path <nixpkgs> vs comparison operator
|
||||
size_t end = pos + 1;
|
||||
bool is_lookup_path = false;
|
||||
|
||||
// Scan for valid lookup path characters until >
|
||||
while (end < input.size() &&
|
||||
(isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
|
||||
input[end] == '/' || input[end] == '.')) {
|
||||
end++;
|
||||
}
|
||||
|
||||
// If we found > and there's content, it's a lookup path
|
||||
if (end < input.size() && input[end] == '>' && end > pos + 1) {
|
||||
std::string path = input.substr(pos + 1, end - pos - 1);
|
||||
tokens.push_back({Token::LOOKUP_PATH, path, line, col});
|
||||
pos = end + 1;
|
||||
col += (end - pos + 1);
|
||||
is_lookup_path = true;
|
||||
}
|
||||
|
||||
if (!is_lookup_path) {
|
||||
emit(TOKEN(LT));
|
||||
}
|
||||
} else if (c == '>') {
|
||||
emit(TOKEN(GT));
|
||||
} else if (c == '!') {
|
||||
emit(TOKEN(NOT));
|
||||
} else if (c == '.') {
|
||||
// Check for ellipsis (...)
|
||||
if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
|
||||
tokens.push_back(TOKEN(ELLIPSIS));
|
||||
pos += 3;
|
||||
col += 3;
|
||||
} else {
|
||||
emit(TOKEN(DOT));
|
||||
}
|
||||
} else if (c == '?') {
|
||||
emit(TOKEN(QUESTION));
|
||||
} else if (c == '~') {
|
||||
// Home-relative path ~/...
|
||||
if (pos + 1 < input.size() && input[pos + 1] == '/') {
|
||||
tokenize_home_path();
|
||||
} else {
|
||||
// Just ~ by itself is an identifier
|
||||
tokenize_ident();
|
||||
}
|
||||
} else if (c == '-') {
|
||||
// Check if it's a negative number or minus operator
|
||||
if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
|
||||
// Check for negative float
|
||||
if (pos + 2 < input.size() && input[pos + 2] == '.') {
|
||||
tokenize_float();
|
||||
} else {
|
||||
tokenize_int();
|
||||
}
|
||||
} else {
|
||||
emit(TOKEN(MINUS));
|
||||
}
|
||||
} else if (isdigit(c)) {
|
||||
// Check if it's a float (digit followed by '.')
|
||||
if (pos + 1 < input.size() && input[pos + 1] == '.') {
|
||||
tokenize_float();
|
||||
} else {
|
||||
tokenize_int();
|
||||
}
|
||||
} else if (isalpha(c)) {
|
||||
// Check if it's a URI (contains ://) - look ahead
|
||||
size_t lookahead = pos;
|
||||
while (lookahead < input.size() &&
|
||||
(isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
|
||||
input[lookahead] == '+' || input[lookahead] == '.'))
|
||||
lookahead++;
|
||||
std::string potential_scheme = input.substr(pos, lookahead - pos);
|
||||
if (lookahead + 2 < input.size() && input[lookahead] == ':' &&
|
||||
input[lookahead + 1] == '/' && input[lookahead + 2] == '/') {
|
||||
// It's a URI, consume the whole thing
|
||||
tokenize_uri();
|
||||
} else {
|
||||
tokenize_ident();
|
||||
}
|
||||
} else {
|
||||
pos++;
|
||||
col++;
|
||||
}
|
||||
}
|
||||
tokens.push_back({Token::EOF_, "", line, col});
|
||||
|
||||
#undef TOKEN
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<Token> tokens;
|
||||
const std::string& input;
|
||||
size_t pos;
|
||||
size_t line;
|
||||
size_t col;
|
||||
|
||||
void emit(const Token& t) {
|
||||
tokens.push_back(t);
|
||||
pos++;
|
||||
col++;
|
||||
}
|
||||
|
||||
void skip_whitespace() {
|
||||
while (pos < input.size()) {
|
||||
char c = input[pos];
|
||||
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
|
||||
if (c == '\n') {
|
||||
line++;
|
||||
col = 1;
|
||||
} else {
|
||||
col++;
|
||||
}
|
||||
pos++;
|
||||
} else if (c == '#') {
|
||||
// Line comment - skip until newline
|
||||
while (pos < input.size() && input[pos] != '\n')
|
||||
pos++;
|
||||
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
|
||||
// Block comment /* ... */
|
||||
// Note: Nix block comments do NOT nest
|
||||
pos += 2; // Skip /*
|
||||
while (pos + 1 < input.size()) {
|
||||
if (input[pos] == '*' && input[pos + 1] == '/') {
|
||||
pos += 2; // Skip */
|
||||
break;
|
||||
}
|
||||
if (input[pos] == '\n') {
|
||||
line++;
|
||||
col = 1;
|
||||
} else {
|
||||
col++;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void tokenize_string() {
|
||||
pos++;
|
||||
std::string s;
|
||||
bool has_interp = false;
|
||||
|
||||
while (pos < input.size() && input[pos] != '"') {
|
||||
if (input[pos] == '\\' && pos + 1 < input.size()) {
|
||||
pos++;
|
||||
switch (input[pos]) {
|
||||
case 'n':
|
||||
s += '\n';
|
||||
break;
|
||||
case 't':
|
||||
s += '\t';
|
||||
break;
|
||||
case 'r':
|
||||
s += '\r';
|
||||
break;
|
||||
case '"':
|
||||
s += '"';
|
||||
break;
|
||||
case '\\':
|
||||
s += '\\';
|
||||
break;
|
||||
case '$':
|
||||
s += '$';
|
||||
break; // Escaped $
|
||||
default:
|
||||
s += input[pos];
|
||||
break;
|
||||
}
|
||||
pos++;
|
||||
} else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
|
||||
// Found interpolation marker
|
||||
has_interp = true;
|
||||
s += input[pos]; // Keep $ in raw string
|
||||
pos++;
|
||||
} else {
|
||||
s += input[pos];
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
pos++;
|
||||
|
||||
Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
|
||||
tokens.push_back({type, s, line, col});
|
||||
col += s.size() + 2;
|
||||
}
|
||||
|
||||
void tokenize_indented_string() {
|
||||
pos += 2; // Skip opening ''
|
||||
std::string raw_content;
|
||||
bool has_interp = false;
|
||||
size_t start_line = line;
|
||||
|
||||
// Collect raw content until closing ''
|
||||
while (pos < input.size()) {
|
||||
// Check for escape sequences
|
||||
if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
|
||||
// Check if it's an escape or the closing delimiter
|
||||
if (pos + 2 < input.size() && input[pos + 2] == '\'') {
|
||||
// ''' -> escape for ''
|
||||
raw_content += "''";
|
||||
pos += 3;
|
||||
continue;
|
||||
} else if (pos + 2 < input.size() && input[pos + 2] == '$') {
|
||||
// ''$ -> escape for $
|
||||
raw_content += '$';
|
||||
pos += 3;
|
||||
continue;
|
||||
} else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
|
||||
// ''\ -> check what follows
|
||||
if (pos + 3 < input.size()) {
|
||||
char next = input[pos + 3];
|
||||
if (next == 'n') {
|
||||
raw_content += '\n';
|
||||
pos += 4;
|
||||
continue;
|
||||
} else if (next == 'r') {
|
||||
raw_content += '\r';
|
||||
pos += 4;
|
||||
continue;
|
||||
} else if (next == 't') {
|
||||
raw_content += '\t';
|
||||
pos += 4;
|
||||
continue;
|
||||
} else if (next == ' ' || next == '\t') {
|
||||
// ''\ before whitespace - preserve the whitespace (mark it specially)
|
||||
raw_content += "\x01"; // Use control char as marker for preserved whitespace
|
||||
raw_content += next;
|
||||
pos += 4;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Default: literal backslash
|
||||
raw_content += '\\';
|
||||
pos += 3;
|
||||
continue;
|
||||
} else {
|
||||
// Just closing ''
|
||||
pos += 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for interpolation
|
||||
if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
|
||||
has_interp = true;
|
||||
raw_content += input[pos];
|
||||
pos++;
|
||||
if (input[pos] == '\n') {
|
||||
line++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Track newlines
|
||||
if (input[pos] == '\n') {
|
||||
line++;
|
||||
raw_content += input[pos];
|
||||
pos++;
|
||||
} else {
|
||||
raw_content += input[pos];
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
// Strip common indentation
|
||||
std::string stripped = strip_indentation(raw_content);
|
||||
|
||||
Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
|
||||
tokens.push_back({type, stripped, start_line, col});
|
||||
}
|
||||
|
||||
// Removes the common leading indentation shared by every non-blank line of an
// indented ('' '') string body, and drops the \x01 markers that protect
// escaped whitespace (produced by the ''\ escape) from being stripped.
std::string strip_indentation(const std::string& s) {
    if (s.empty())
        return s;

    // Break the body into physical lines. A trailing '\n' contributes one
    // final empty line so the newline survives the re-join below.
    std::vector<std::string> rows;
    std::string row;
    for (char ch : s) {
        if (ch == '\n') {
            rows.push_back(row);
            row.clear();
        } else {
            row += ch;
        }
    }
    if (!row.empty() || s.back() == '\n')
        rows.push_back(row);

    // Smallest run of leading spaces/tabs over the non-empty rows. A \x01
    // marker terminates the countable indent for that row (it is neither a
    // space nor a tab, so the scan stops there naturally).
    // NOTE(review): rows consisting purely of whitespace still participate
    // in this minimum — confirm that matches the intended '' semantics.
    size_t common = std::string::npos;
    for (const auto& r : rows) {
        if (r.empty())
            continue; // blank rows do not constrain the indent
        size_t run = 0;
        while (run < r.size() && (r[run] == ' ' || r[run] == '\t'))
            run++;
        if (run < common)
            common = run;
    }
    if (common == std::string::npos)
        common = 0;

    // Reassemble: drop up to `common` leading characters per non-empty row
    // (stopping early at a preserved-whitespace marker), erase the markers
    // themselves, and re-join with '\n' (no newline after the final row).
    std::string out;
    for (size_t i = 0; i < rows.size(); i++) {
        const std::string& r = rows[i];
        if (!r.empty()) {
            size_t cursor = 0;
            while (cursor < common && cursor < r.size() && r[cursor] != '\x01')
                cursor++;
            for (size_t j = cursor; j < r.size(); j++) {
                if (r[j] != '\x01')
                    out += r[j];
            }
        }
        if (i + 1 < rows.size())
            out += '\n';
    }

    return out;
}
|
||||
|
||||
void tokenize_path() {
|
||||
size_t start = pos;
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
|
||||
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
std::string path = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::PATH, path, line, col});
|
||||
col += path.size();
|
||||
}
|
||||
|
||||
void tokenize_home_path() {
|
||||
size_t start = pos;
|
||||
pos++; // Skip ~
|
||||
if (pos < input.size() && input[pos] == '/') {
|
||||
// Home-relative path ~/something
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
|
||||
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
std::string path = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::PATH, path, line, col});
|
||||
col += path.size();
|
||||
}
|
||||
|
||||
void tokenize_int() {
|
||||
size_t start = pos;
|
||||
if (input[pos] == '-')
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
std::string num = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::INT, num, line, col});
|
||||
col += num.size();
|
||||
}
|
||||
|
||||
void tokenize_float() {
|
||||
size_t start = pos;
|
||||
if (input[pos] == '-')
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
if (pos < input.size() && input[pos] == '.') {
|
||||
pos++;
|
||||
while (pos < input.size() && isdigit(input[pos]))
|
||||
pos++;
|
||||
}
|
||||
std::string num = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::FLOAT, num, line, col});
|
||||
col += num.size();
|
||||
}
|
||||
|
||||
void tokenize_uri() {
|
||||
size_t start = pos;
|
||||
while (pos < input.size() && !isspace(input[pos]) && input[pos] != ')' && input[pos] != ']' &&
|
||||
input[pos] != ';') {
|
||||
pos++;
|
||||
}
|
||||
std::string uri = input.substr(start, pos - start);
|
||||
tokens.push_back({Token::URI, uri, line, col});
|
||||
col += uri.size();
|
||||
}
|
||||
|
||||
void tokenize_ident() {
|
||||
size_t start = pos;
|
||||
// Note: Don't include '.' here - it's used for selection (a.b.c)
|
||||
// URIs are handled separately by checking for '://' pattern
|
||||
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
|
||||
pos++;
|
||||
std::string ident = input.substr(start, pos - start);
|
||||
|
||||
// Check if it's a URI (contains ://)
|
||||
size_t scheme_end = ident.find("://");
|
||||
if (scheme_end != std::string::npos && scheme_end > 0) {
|
||||
tokens.push_back({Token::URI, ident, line, col});
|
||||
col += ident.size();
|
||||
return;
|
||||
}
|
||||
|
||||
Token::Type type = Token::IDENT;
|
||||
if (ident == "let")
|
||||
type = Token::LET;
|
||||
else if (ident == "in")
|
||||
type = Token::IN;
|
||||
else if (ident == "rec")
|
||||
type = Token::REC;
|
||||
else if (ident == "if")
|
||||
type = Token::IF;
|
||||
else if (ident == "then")
|
||||
type = Token::THEN;
|
||||
else if (ident == "else")
|
||||
type = Token::ELSE;
|
||||
else if (ident == "assert")
|
||||
type = Token::ASSERT;
|
||||
else if (ident == "with")
|
||||
type = Token::WITH;
|
||||
else if (ident == "inherit")
|
||||
type = Token::INHERIT;
|
||||
else if (ident == "import")
|
||||
type = Token::IMPORT;
|
||||
else if (ident == "true")
|
||||
type = Token::BOOL;
|
||||
else if (ident == "false")
|
||||
type = Token::BOOL;
|
||||
|
||||
tokens.push_back({type, ident, line, col});
|
||||
col += ident.size();
|
||||
}
|
||||
};
|
||||
|
||||
class Parser::Impl {
|
||||
public:
|
||||
std::vector<Token> tokens;
|
||||
|
|
@ -706,9 +85,9 @@ public:
|
|||
|
||||
bool expect(Token::Type type) {
|
||||
if (current().type != type) {
|
||||
std::cerr << "Expected token " << type << " but got " << current().type << " at "
|
||||
<< current().line << ":" << current().col << "\n";
|
||||
return false;
|
||||
throw std::runtime_error(
|
||||
"Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) +
|
||||
" at " + std::to_string(current().line) + ":" + std::to_string(current().col));
|
||||
}
|
||||
advance();
|
||||
return true;
|
||||
|
|
@ -718,7 +97,7 @@ public:
|
|||
int get_precedence(Token::Type type) {
|
||||
switch (type) {
|
||||
case Token::MERGE:
|
||||
return 1; // Low precedence - binds loosely, but must be > 0 to be recognized as binary op
|
||||
return 1; // low precedence - binds loosely, but must be > 0 to be recognized as binary op
|
||||
case Token::OR:
|
||||
return 1;
|
||||
case Token::AND:
|
||||
|
|
@ -942,9 +321,10 @@ public:
|
|||
} else if (current().type == Token::IDENT || current().type == Token::INT ||
|
||||
current().type == Token::FLOAT || current().type == Token::BOOL ||
|
||||
current().type == Token::PATH || current().type == Token::LOOKUP_PATH ||
|
||||
current().type == Token::URI || current().type == Token::LBRACKET) {
|
||||
current().type == Token::URI || current().type == Token::LBRACKET ||
|
||||
current().type == Token::LBRACE) {
|
||||
// Juxtaposition application: f x
|
||||
// Parse the argument as a primary expression (which handles lists, etc.)
|
||||
// Parse the argument as a primary expression (which handles lists, attrsets, etc.)
|
||||
auto arg = parse_expr3();
|
||||
left = std::make_shared<Node>(AppNode(left, arg));
|
||||
} else {
|
||||
|
|
@ -1056,9 +436,8 @@ public:
|
|||
return std::make_shared<Node>(ConstBoolNode(t.value == "true"));
|
||||
}
|
||||
|
||||
std::cerr << "Unknown token: " << t.value << " (type " << t.type << ")\n";
|
||||
advance();
|
||||
return std::make_shared<Node>(ConstNullNode());
|
||||
throw std::runtime_error("Unknown token: " + t.value + " (type " + std::to_string(t.type) +
|
||||
")");
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> parse_attrs() {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue