irc: split parser into lexer and parser components

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I4e73459a02caff5335d690656fd6f1396a6a6964
This commit is contained in:
raf 2026-04-24 14:39:51 +03:00
commit 0a5920adaf
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
3 changed files with 666 additions and 631 deletions

562
src/irc/lexer.cpp Normal file
View file

@ -0,0 +1,562 @@
#include "lexer.h"
#include <cctype>
namespace nix_irc {
// Construct a lexer over `src`. The source text is borrowed by reference and
// must outlive this object; position tracking starts at line 1, column 1.
Lexer::Lexer(const std::string& src) : input(src), pos(0), line(1), col(1) {}
// Scan the whole input and return the token stream, always terminated by a
// single EOF_ token. Longest-match rules: two-character operators are tested
// before their one-character prefixes, and paths/URIs/lookup paths are
// disambiguated by lookahead before falling back to operators/identifiers.
std::vector<Token> Lexer::tokenize() {
// Zero-length token literal stamped with the current line/col.
#define TOKEN(t) \
    Token { \
        Token::t, "", line, col \
    }
    // A digit run is a FLOAT only when it is immediately followed by '.' and
    // another digit; otherwise it is an INT, so selection such as `x.1` still
    // lexes as ... DOT INT. The previous code looked only one digit ahead,
    // which mis-lexed multi-digit floats like `12.5` as INT DOT INT.
    auto is_float_ahead = [&](size_t first_digit) {
        size_t d = first_digit;
        while (d < input.size() && isdigit(input[d]))
            d++;
        return d + 1 < input.size() && input[d] == '.' && isdigit(input[d + 1]);
    };
    while (pos < input.size()) {
        skip_whitespace();
        if (pos >= input.size())
            break;
        char c = input[pos];
        if (c == '(') {
            emit(TOKEN(LPAREN));
        } else if (c == ')') {
            emit(TOKEN(RPAREN));
        } else if (c == '{') {
            emit(TOKEN(LBRACE));
        } else if (c == '}') {
            emit(TOKEN(RBRACE));
        } else if (c == '[') {
            emit(TOKEN(LBRACKET));
        } else if (c == ']') {
            emit(TOKEN(RBRACKET));
        } else if (c == ';') {
            emit(TOKEN(SEMICOLON));
        } else if (c == ':') {
            emit(TOKEN(COLON));
        } else if (c == '@') {
            emit(TOKEN(AT));
        } else if (c == ',') {
            emit(TOKEN(COMMA));
        } else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
            tokenize_indented_string();
        } else if (c == '"') {
            tokenize_string();
        }
        // Two-char operators (must precede their one-char prefixes)
        else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(EQEQ));
            pos += 2;
            col += 2;
        } else if (c == '=') {
            emit(TOKEN(EQUALS));
        } else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(NE));
            pos += 2;
            col += 2;
        } else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(LE));
            pos += 2;
            col += 2;
        } else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
            tokens.push_back(TOKEN(GE));
            pos += 2;
            col += 2;
        } else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
            tokens.push_back(TOKEN(CONCAT));
            pos += 2;
            col += 2;
        } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
            tokens.push_back(TOKEN(MERGE));
            pos += 2;
            col += 2;
        } else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
            tokens.push_back(TOKEN(AND));
            pos += 2;
            col += 2;
        } else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
            tokens.push_back(TOKEN(OR));
            pos += 2;
            col += 2;
        } else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
            tokens.push_back(TOKEN(IMPL));
            pos += 2;
            col += 2;
        }
        // Single-char operators
        else if (c == '+') {
            emit(TOKEN(PLUS));
        } else if (c == '*') {
            emit(TOKEN(STAR));
        } else if (c == '/') {
            // A '/' followed by a path character starts an absolute path;
            // otherwise it is division.
            if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
                tokenize_path();
            } else {
                emit(TOKEN(SLASH));
            }
        } else if (c == '<') {
            // Lookup path <nixpkgs> vs the less-than operator: scan a run of
            // path characters and require a closing '>' with non-empty body.
            size_t end = pos + 1;
            while (end < input.size() && (isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
                                          input[end] == '/' || input[end] == '.')) {
                end++;
            }
            if (end < input.size() && input[end] == '>' && end > pos + 1) {
                std::string path = input.substr(pos + 1, end - pos - 1);
                tokens.push_back({Token::LOOKUP_PATH, path, line, col});
                // Advance col by the whole "<...>" span BEFORE moving pos.
                // The previous order reassigned pos first, which collapsed
                // the col delta to zero and broke column tracking.
                col += end - pos + 1;
                pos = end + 1;
            } else {
                emit(TOKEN(LT));
            }
        } else if (c == '>') {
            emit(TOKEN(GT));
        } else if (c == '!') {
            emit(TOKEN(NOT));
        } else if (c == '.') {
            // Ellipsis (...) or attribute-selection dot.
            if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
                tokens.push_back(TOKEN(ELLIPSIS));
                pos += 3;
                col += 3;
            } else {
                emit(TOKEN(DOT));
            }
        } else if (c == '?') {
            emit(TOKEN(QUESTION));
        } else if (c == '~') {
            // Home-relative path ~/...; a lone '~' is lexed as an identifier.
            if (pos + 1 < input.size() && input[pos + 1] == '/') {
                tokenize_home_path();
            } else {
                tokenize_ident();
            }
        } else if (c == '-') {
            // Negative number literal or binary minus ('->' handled above).
            if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
                if (is_float_ahead(pos + 1)) {
                    tokenize_float();
                } else {
                    tokenize_int();
                }
            } else {
                emit(TOKEN(MINUS));
            }
        } else if (isdigit(c)) {
            if (is_float_ahead(pos)) {
                tokenize_float();
            } else {
                tokenize_int();
            }
        } else if (isalpha(c)) {
            // URI (scheme://...) vs identifier: look ahead for "://" right
            // after a run of scheme characters.
            size_t lookahead = pos;
            while (lookahead < input.size() &&
                   (isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
                    input[lookahead] == '+' || input[lookahead] == '.'))
                lookahead++;
            if (lookahead + 2 < input.size() && input[lookahead] == ':' && input[lookahead + 1] == '/' &&
                input[lookahead + 2] == '/') {
                tokenize_uri();
            } else {
                tokenize_ident();
            }
        } else {
            // Unknown byte: skip it so the scan always makes progress.
            pos++;
            col++;
        }
    }
    tokens.push_back({Token::EOF_, "", line, col});
#undef TOKEN
    return tokens;
}
// Record a single-character token and step past the character that produced
// it. Single-char tokens never contain a newline, so only the column moves.
void Lexer::emit(const Token& t) {
    tokens.push_back(t);
    ++pos;
    ++col;
}
// Advance `pos` past whitespace and comments, keeping `line`/`col` in sync.
// On return, `pos` is either at the start of a real token or at end of input.
void Lexer::skip_whitespace() {
    while (pos < input.size()) {
        char c = input[pos];
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            if (c == '\n') {
                line++;
                col = 1; // columns restart after each newline
            } else {
                col++;
            }
            pos++;
        } else if (c == '#') {
            // Line comment - skip until newline
            // (col is not advanced here; the '\n' that follows resets it,
            // and a line comment cannot be followed by a token on its line)
            while (pos < input.size() && input[pos] != '\n')
                pos++;
        } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
            // Block comment /* ... */
            // Note: Nix block comments do NOT nest
            pos += 2; // Skip /*
            // An unterminated comment simply ends this loop; the outer scan
            // then resumes on whatever trailing byte remains.
            while (pos + 1 < input.size()) {
                if (input[pos] == '*' && input[pos + 1] == '/') {
                    pos += 2; // Skip */
                    break;
                }
                if (input[pos] == '\n') {
                    line++;
                    col = 1;
                } else {
                    col++;
                }
                pos++;
            }
        } else {
            break; // next char starts a token
        }
    }
}
// Lex a double-quoted "..." string. Backslash escapes are decoded here;
// ${...} interpolation markers are kept in the raw text and flagged via the
// STRING_INTERP token type so the parser can split the string later.
// The token is stamped with the position of the opening quote, and
// line/col are tracked through the literal so tokens after a multi-line or
// escape-heavy string report correct positions (previously `line` was never
// advanced inside a string and `col` was advanced by the *decoded* length).
void Lexer::tokenize_string() {
    size_t start_line = line;
    size_t start_col = col;
    pos++; // consume opening '"'
    col++;
    std::string s;
    bool has_interp = false;
    while (pos < input.size() && input[pos] != '"') {
        if (input[pos] == '\\' && pos + 1 < input.size()) {
            pos++; // consume the backslash
            col++;
            switch (input[pos]) {
            case 'n':
                s += '\n';
                break;
            case 't':
                s += '\t';
                break;
            case 'r':
                s += '\r';
                break;
            case '"':
                s += '"';
                break;
            case '\\':
                s += '\\';
                break;
            case '$':
                s += '$';
                break; // Escaped $ - no interpolation
            default:
                s += input[pos]; // unknown escape: keep the char literally
                break;
            }
            pos++;
            col++;
        } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
            // Found interpolation marker
            has_interp = true;
            s += input[pos]; // Keep $ in raw string
            pos++;
            col++;
        } else {
            if (input[pos] == '\n') {
                // Plain strings may span lines; keep the position counters
                // accurate for everything lexed after the literal.
                line++;
                col = 1;
            } else {
                col++;
            }
            s += input[pos];
            pos++;
        }
    }
    pos++; // consume closing '"' (or step past EOF on unterminated input)
    col++;
    Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
    tokens.push_back({type, s, start_line, start_col});
}
// Lex a Nix indented string (''...''), handling the ''' , ''$ and ''\ escape
// forms and detecting ${...} interpolation. The collected raw text has its
// common leading indentation stripped by strip_indentation() before the
// token (INDENTED_STRING or INDENTED_STRING_INTERP) is emitted.
// NOTE(review): `col` is not advanced past the literal here, so column
// numbers of tokens following an indented string on the same line are off —
// confirm whether anything downstream relies on col after these literals.
void Lexer::tokenize_indented_string() {
    pos += 2; // Skip opening ''
    std::string raw_content;
    bool has_interp = false;
    size_t start_line = line; // token is reported at the line where '' opened
    // Collect raw content until closing ''
    while (pos < input.size()) {
        // Check for escape sequences
        if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
            // Check if it's an escape or the closing delimiter
            if (pos + 2 < input.size() && input[pos + 2] == '\'') {
                // ''' -> escape for ''
                raw_content += "''";
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '$') {
                // ''$ -> escape for $
                raw_content += '$';
                pos += 3;
                continue;
            } else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
                // ''\ -> check what follows
                if (pos + 3 < input.size()) {
                    char next = input[pos + 3];
                    if (next == 'n') {
                        raw_content += '\n';
                        pos += 4;
                        continue;
                    } else if (next == 'r') {
                        raw_content += '\r';
                        pos += 4;
                        continue;
                    } else if (next == 't') {
                        raw_content += '\t';
                        pos += 4;
                        continue;
                    } else if (next == ' ' || next == '\t') {
                        // ''\ before whitespace - preserve the whitespace by prepending a marker
                        // We use a special escape sequence that won't appear in normal text;
                        // strip_indentation() recognizes and removes the marker again.
                        raw_content += "\x1F\x1F"; // Unit separator pair as marker for preserved whitespace
                        raw_content += next;
                        pos += 4;
                        continue;
                    }
                }
                // Default: literal backslash
                raw_content += '\\';
                pos += 3;
                continue;
            } else {
                // Just closing ''
                pos += 2;
                break;
            }
        }
        // Check for interpolation
        if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
            has_interp = true;
            raw_content += input[pos];
            pos++;
            // NOTE(review): after pos++ the current char is '{', so this
            // newline check can never fire; looks like dead code — confirm.
            if (input[pos] == '\n') {
                line++;
            }
            continue;
        }
        // Track newlines
        if (input[pos] == '\n') {
            line++;
            raw_content += input[pos];
            pos++;
        } else {
            raw_content += input[pos];
            pos++;
        }
    }
    // Strip common indentation
    std::string stripped = strip_indentation(raw_content);
    Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
    tokens.push_back({type, stripped, start_line, col});
}
// Remove the common leading indentation from every line of an indented
// string: the minimum indent over non-empty lines is stripped from each
// line. The two-byte "\x1F\x1F" marker inserted by tokenize_indented_string()
// for ''\-escaped whitespace halts both the indent measurement and the
// stripping (so escaped whitespace survives); the marker bytes themselves
// are removed from the output.
std::string Lexer::strip_indentation(const std::string& s) {
    if (s.empty())
        return s;
    // Split into lines
    std::vector<std::string> lines;
    std::string current_line;
    for (char c : s) {
        if (c == '\n') {
            lines.push_back(current_line);
            current_line.clear();
        } else {
            current_line += c;
        }
    }
    // Keep a final line when there is trailing text, or an empty final line
    // when the content ends with '\n' (preserves the trailing newline below).
    if (!current_line.empty() || (!s.empty() && s.back() == '\n')) {
        lines.push_back(current_line);
    }
    // Find minimum indentation (spaces/tabs at start of non-empty lines)
    // \x1F\x1F marker indicates preserved whitespace (from ''\ escape)
    size_t min_indent = std::string::npos;
    for (const auto& line : lines) {
        if (line.empty())
            continue; // Skip empty lines when calculating indentation
        size_t indent = 0;
        for (size_t i = 0; i < line.size(); i++) {
            char c = line[i];
            // If we hit the preserved whitespace marker, stop counting indentation
            if (c == '\x1F' && i + 1 < line.size() && line[i + 1] == '\x1F') {
                break;
            }
            if (c == ' ' || c == '\t')
                indent++;
            else
                break;
        }
        if (indent < min_indent)
            min_indent = indent;
    }
    if (min_indent == std::string::npos)
        min_indent = 0; // all lines empty: nothing to strip
    // Strip min_indent from all lines and remove \x1F\x1F markers
    std::string result;
    for (size_t i = 0; i < lines.size(); i++) {
        const auto& line = lines[i];
        if (line.empty()) {
            // Preserve empty lines
            if (i + 1 < lines.size())
                result += '\n';
        } else {
            // Strip indentation, being careful about \x1F\x1F markers
            size_t skip = 0;
            size_t pos = 0;
            while (skip < min_indent && pos < line.size()) {
                if (line[pos] == '\x1F' && pos + 1 < line.size() && line[pos + 1] == '\x1F') {
                    // Hit preserved whitespace marker - don't strip any more
                    break;
                }
                skip++;
                pos++;
            }
            // Add the rest of the line, removing \x1F\x1F markers
            for (size_t j = pos; j < line.size(); j++) {
                if (line[j] == '\x1F' && j + 1 < line.size() && line[j + 1] == '\x1F') {
                    j++; // Skip both marker bytes
                    continue;
                }
                result += line[j];
            }
            if (i + 1 < lines.size())
                result += '\n';
        }
    }
    return result;
}
// Lex a path literal (e.g. ./foo, /etc/nixos): it extends until whitespace
// or a structural delimiter ends it.
void Lexer::tokenize_path() {
    auto terminates = [](char ch) {
        return isspace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' ||
               ch == ']' || ch == ';';
    };
    const size_t begin = pos;
    while (pos < input.size() && !terminates(input[pos]))
        ++pos;
    const std::string path = input.substr(begin, pos - begin);
    tokens.push_back({Token::PATH, path, line, col});
    col += path.size();
}
// Lex a home-relative path (~/something). `pos` sits on the '~'; if no '/'
// follows, only the '~' is consumed and emitted as a PATH token.
void Lexer::tokenize_home_path() {
    auto terminates = [](char ch) {
        return isspace(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' ||
               ch == ']' || ch == ';';
    };
    const size_t begin = pos;
    ++pos; // consume '~'
    if (pos < input.size() && input[pos] == '/') {
        // Consume the rest of the path until whitespace or a delimiter.
        while (pos < input.size() && !terminates(input[pos]))
            ++pos;
    }
    const std::string path = input.substr(begin, pos - begin);
    tokens.push_back({Token::PATH, path, line, col});
    col += path.size();
}
// Lex an integer literal. `pos` sits on the first digit or a leading '-'.
void Lexer::tokenize_int() {
    const size_t begin = pos;
    if (input[pos] == '-')
        ++pos; // optional sign
    while (pos < input.size() && isdigit(input[pos]))
        ++pos;
    const std::string text = input.substr(begin, pos - begin);
    tokens.push_back({Token::INT, text, line, col});
    col += text.size();
}
// Lex a float literal of the form [-]digits[.digits]. `pos` sits on the
// first digit or a leading '-'.
void Lexer::tokenize_float() {
    const size_t begin = pos;
    if (input[pos] == '-')
        ++pos; // optional sign
    auto eat_digits = [&] {
        while (pos < input.size() && isdigit(input[pos]))
            ++pos;
    };
    eat_digits(); // integer part
    if (pos < input.size() && input[pos] == '.') {
        ++pos; // decimal point
        eat_digits(); // fractional part (may be empty)
    }
    const std::string text = input.substr(begin, pos - begin);
    tokens.push_back({Token::FLOAT, text, line, col});
    col += text.size();
}
// Lex a bare URI (scheme://...). URIs may contain characters that would
// otherwise lex as operators, so consume greedily until whitespace or a
// clear structural terminator.
void Lexer::tokenize_uri() {
    const size_t begin = pos;
    for (; pos < input.size(); ++pos) {
        const char ch = input[pos];
        if (isspace(ch) || ch == ')' || ch == ']' || ch == ';')
            break;
    }
    const std::string uri = input.substr(begin, pos - begin);
    tokens.push_back({Token::URI, uri, line, col});
    col += uri.size();
}
void Lexer::tokenize_ident() {
size_t start = pos;
// Note: Don't include '.' here - it's used for selection (a.b.c)
// URIs are handled separately by checking for '://' pattern
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
pos++;
std::string ident = input.substr(start, pos - start);
// Check if it's a URI (contains ://)
size_t scheme_end = ident.find("://");
if (scheme_end != std::string::npos && scheme_end > 0) {
tokens.push_back({Token::URI, ident, line, col});
col += ident.size();
return;
}
Token::Type type = Token::IDENT;
if (ident == "let")
type = Token::LET;
else if (ident == "in")
type = Token::IN;
else if (ident == "rec")
type = Token::REC;
else if (ident == "if")
type = Token::IF;
else if (ident == "then")
type = Token::THEN;
else if (ident == "else")
type = Token::ELSE;
else if (ident == "assert")
type = Token::ASSERT;
else if (ident == "with")
type = Token::WITH;
else if (ident == "inherit")
type = Token::INHERIT;
else if (ident == "import")
type = Token::IMPORT;
else if (ident == "true")
type = Token::BOOL;
else if (ident == "false")
type = Token::BOOL;
tokens.push_back({type, ident, line, col});
col += ident.size();
}
} // namespace nix_irc

94
src/irc/lexer.h Normal file
View file

@ -0,0 +1,94 @@
#pragma once
#include <string>
#include <vector>
namespace nix_irc {
// A single lexical token produced by Lexer::tokenize().
// NOTE(review): enumerator values are printed numerically in parser
// diagnostics, so do not reorder this enum casually.
struct Token {
    enum Type {
        // Grouping
        LPAREN,
        RPAREN,
        LBRACE,
        RBRACE,
        LBRACKET,
        RBRACKET,
        // Atoms and literals (the token text is carried in `value`)
        IDENT,
        STRING,
        STRING_INTERP,          // "..." containing ${...}
        INDENTED_STRING,
        INDENTED_STRING_INTERP, // ''...'' containing ${...}
        PATH,
        LOOKUP_PATH,            // <nixpkgs>-style search path (value excludes <>)
        INT,
        FLOAT,
        URI,
        BOOL,                   // value is "true" or "false"
        // Keywords
        LET,
        IN,
        REC,
        IF,
        THEN,
        ELSE,
        ASSERT,
        WITH,
        INHERIT,
        IMPORT,
        // Punctuation
        DOT,
        SEMICOLON,
        COLON,
        EQUALS,
        AT,
        COMMA,
        QUESTION,
        ELLIPSIS, // ...
        // Operators
        PLUS,
        MINUS,
        STAR,
        SLASH,
        CONCAT, // ++
        MERGE,  // //
        EQEQ,
        NE,
        LT,
        GT,
        LE,
        GE,
        AND,  // &&
        OR,   // ||
        IMPL, // ->
        NOT,
        EOF_ // sentinel appended once at the end of the token stream
    } type;
    std::string value; // token text where meaningful; empty for punctuation/operators
    size_t line;       // 1-based line of the token's first character
    size_t col;        // 1-based column of the token's first character
};
// Hand-written lexer for Nix expressions. Scans the whole input eagerly;
// the scanning rules live in lexer.cpp.
class Lexer {
public:
    // `input` is held by reference and must outlive the Lexer.
    Lexer(const std::string& input);
    // Scan the entire input and return the token stream, terminated by a
    // single EOF_ token.
    std::vector<Token> tokenize();
private:
    std::vector<Token> tokens; // accumulated output tokens
    const std::string& input;  // borrowed source text (not owned)
    size_t pos;                // current byte offset into input
    size_t line;               // 1-based line of the next character
    size_t col;                // 1-based column of the next character
    void emit(const Token& t);       // push t and advance one character
    void skip_whitespace();          // skip spaces plus # and /* */ comments
    void tokenize_string();          // "..." literal
    void tokenize_indented_string(); // ''...'' literal
    std::string strip_indentation(const std::string& s); // common-indent removal
    void tokenize_path();      // /x and ./x style paths
    void tokenize_home_path(); // ~/x paths
    void tokenize_int();
    void tokenize_float();
    void tokenize_uri();
    void tokenize_ident(); // identifier or keyword
};
} // namespace nix_irc

View file

@ -1,4 +1,5 @@
#include "parser.h" #include "parser.h"
#include "lexer.h"
#include <array> #include <array>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
@ -59,628 +60,6 @@ static std::pair<std::string, std::string> run_command(const std::string& cmd) {
return {result, ""}; return {result, ""};
} }
struct Token {
enum Type {
LPAREN,
RPAREN,
LBRACE,
RBRACE,
LBRACKET,
RBRACKET,
IDENT,
STRING,
STRING_INTERP,
INDENTED_STRING,
INDENTED_STRING_INTERP,
PATH,
LOOKUP_PATH,
INT,
FLOAT,
URI,
BOOL,
LET,
IN,
REC,
IF,
THEN,
ELSE,
ASSERT,
WITH,
INHERIT,
IMPORT,
DOT,
SEMICOLON,
COLON,
EQUALS,
AT,
COMMA,
QUESTION,
ELLIPSIS,
// Operators
PLUS,
MINUS,
STAR,
SLASH,
CONCAT,
MERGE,
EQEQ,
NE,
LT,
GT,
LE,
GE,
AND,
OR,
IMPL,
NOT,
EOF_
} type;
std::string value;
size_t line;
size_t col;
};
class Lexer {
public:
Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {}
std::vector<Token> tokenize() {
#define TOKEN(t) \
Token { Token::t, "", line, col }
while (pos < input.size()) {
skip_whitespace();
if (pos >= input.size())
break;
char c = input[pos];
if (c == '(') {
emit(TOKEN(LPAREN));
} else if (c == ')') {
emit(TOKEN(RPAREN));
} else if (c == '{') {
emit(TOKEN(LBRACE));
} else if (c == '}') {
emit(TOKEN(RBRACE));
} else if (c == '[') {
emit(TOKEN(LBRACKET));
} else if (c == ']') {
emit(TOKEN(RBRACKET));
} else if (c == ';') {
emit(TOKEN(SEMICOLON));
} else if (c == ':') {
emit(TOKEN(COLON));
} else if (c == '@') {
emit(TOKEN(AT));
} else if (c == ',') {
emit(TOKEN(COMMA));
} else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') {
tokenize_indented_string();
} else if (c == '"') {
tokenize_string();
}
// Two-char operators
else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(EQEQ));
pos += 2;
col += 2;
} else if (c == '=') {
emit(TOKEN(EQUALS));
} else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(NE));
pos += 2;
col += 2;
} else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(LE));
pos += 2;
col += 2;
} else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') {
tokens.push_back(TOKEN(GE));
pos += 2;
col += 2;
} else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') {
tokens.push_back(TOKEN(CONCAT));
pos += 2;
col += 2;
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') {
tokens.push_back(TOKEN(MERGE));
pos += 2;
col += 2;
} else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') {
tokens.push_back(TOKEN(AND));
pos += 2;
col += 2;
} else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') {
tokens.push_back(TOKEN(OR));
pos += 2;
col += 2;
} else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') {
tokens.push_back(TOKEN(IMPL));
pos += 2;
col += 2;
}
// Single-char operators
else if (c == '+') {
emit(TOKEN(PLUS));
} else if (c == '*') {
emit(TOKEN(STAR));
} else if (c == '/') {
// Check if it's a path or division
if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) {
tokenize_path();
} else {
emit(TOKEN(SLASH));
}
} else if (c == '<') {
// Check for lookup path <nixpkgs> vs comparison operator
size_t end = pos + 1;
bool is_lookup_path = false;
// Scan for valid lookup path characters until >
while (end < input.size() &&
(isalnum(input[end]) || input[end] == '-' || input[end] == '_' ||
input[end] == '/' || input[end] == '.')) {
end++;
}
// If we found > and there's content, it's a lookup path
if (end < input.size() && input[end] == '>' && end > pos + 1) {
std::string path = input.substr(pos + 1, end - pos - 1);
tokens.push_back({Token::LOOKUP_PATH, path, line, col});
pos = end + 1;
col += (end - pos + 1);
is_lookup_path = true;
}
if (!is_lookup_path) {
emit(TOKEN(LT));
}
} else if (c == '>') {
emit(TOKEN(GT));
} else if (c == '!') {
emit(TOKEN(NOT));
} else if (c == '.') {
// Check for ellipsis (...)
if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
tokens.push_back(TOKEN(ELLIPSIS));
pos += 3;
col += 3;
} else {
emit(TOKEN(DOT));
}
} else if (c == '?') {
emit(TOKEN(QUESTION));
} else if (c == '~') {
// Home-relative path ~/...
if (pos + 1 < input.size() && input[pos + 1] == '/') {
tokenize_home_path();
} else {
// Just ~ by itself is an identifier
tokenize_ident();
}
} else if (c == '-') {
// Check if it's a negative number or minus operator
if (pos + 1 < input.size() && isdigit(input[pos + 1])) {
// Check for negative float
if (pos + 2 < input.size() && input[pos + 2] == '.') {
tokenize_float();
} else {
tokenize_int();
}
} else {
emit(TOKEN(MINUS));
}
} else if (isdigit(c)) {
// Check if it's a float (digit followed by '.')
if (pos + 1 < input.size() && input[pos + 1] == '.') {
tokenize_float();
} else {
tokenize_int();
}
} else if (isalpha(c)) {
// Check if it's a URI (contains ://) - look ahead
size_t lookahead = pos;
while (lookahead < input.size() &&
(isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' ||
input[lookahead] == '+' || input[lookahead] == '.'))
lookahead++;
std::string potential_scheme = input.substr(pos, lookahead - pos);
if (lookahead + 2 < input.size() && input[lookahead] == ':' &&
input[lookahead + 1] == '/' && input[lookahead + 2] == '/') {
// It's a URI, consume the whole thing
tokenize_uri();
} else {
tokenize_ident();
}
} else {
pos++;
col++;
}
}
tokens.push_back({Token::EOF_, "", line, col});
#undef TOKEN
return tokens;
}
private:
std::vector<Token> tokens;
const std::string& input;
size_t pos;
size_t line;
size_t col;
void emit(const Token& t) {
tokens.push_back(t);
pos++;
col++;
}
void skip_whitespace() {
while (pos < input.size()) {
char c = input[pos];
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
if (c == '\n') {
line++;
col = 1;
} else {
col++;
}
pos++;
} else if (c == '#') {
// Line comment - skip until newline
while (pos < input.size() && input[pos] != '\n')
pos++;
} else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
// Block comment /* ... */
// Note: Nix block comments do NOT nest
pos += 2; // Skip /*
while (pos + 1 < input.size()) {
if (input[pos] == '*' && input[pos + 1] == '/') {
pos += 2; // Skip */
break;
}
if (input[pos] == '\n') {
line++;
col = 1;
} else {
col++;
}
pos++;
}
} else {
break;
}
}
}
void tokenize_string() {
pos++;
std::string s;
bool has_interp = false;
while (pos < input.size() && input[pos] != '"') {
if (input[pos] == '\\' && pos + 1 < input.size()) {
pos++;
switch (input[pos]) {
case 'n':
s += '\n';
break;
case 't':
s += '\t';
break;
case 'r':
s += '\r';
break;
case '"':
s += '"';
break;
case '\\':
s += '\\';
break;
case '$':
s += '$';
break; // Escaped $
default:
s += input[pos];
break;
}
pos++;
} else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
// Found interpolation marker
has_interp = true;
s += input[pos]; // Keep $ in raw string
pos++;
} else {
s += input[pos];
pos++;
}
}
pos++;
Token::Type type = has_interp ? Token::STRING_INTERP : Token::STRING;
tokens.push_back({type, s, line, col});
col += s.size() + 2;
}
void tokenize_indented_string() {
pos += 2; // Skip opening ''
std::string raw_content;
bool has_interp = false;
size_t start_line = line;
// Collect raw content until closing ''
while (pos < input.size()) {
// Check for escape sequences
if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
// Check if it's an escape or the closing delimiter
if (pos + 2 < input.size() && input[pos + 2] == '\'') {
// ''' -> escape for ''
raw_content += "''";
pos += 3;
continue;
} else if (pos + 2 < input.size() && input[pos + 2] == '$') {
// ''$ -> escape for $
raw_content += '$';
pos += 3;
continue;
} else if (pos + 2 < input.size() && input[pos + 2] == '\\') {
// ''\ -> check what follows
if (pos + 3 < input.size()) {
char next = input[pos + 3];
if (next == 'n') {
raw_content += '\n';
pos += 4;
continue;
} else if (next == 'r') {
raw_content += '\r';
pos += 4;
continue;
} else if (next == 't') {
raw_content += '\t';
pos += 4;
continue;
} else if (next == ' ' || next == '\t') {
// ''\ before whitespace - preserve the whitespace (mark it specially)
raw_content += "\x01"; // Use control char as marker for preserved whitespace
raw_content += next;
pos += 4;
continue;
}
}
// Default: literal backslash
raw_content += '\\';
pos += 3;
continue;
} else {
// Just closing ''
pos += 2;
break;
}
}
// Check for interpolation
if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
has_interp = true;
raw_content += input[pos];
pos++;
if (input[pos] == '\n') {
line++;
}
continue;
}
// Track newlines
if (input[pos] == '\n') {
line++;
raw_content += input[pos];
pos++;
} else {
raw_content += input[pos];
pos++;
}
}
// Strip common indentation
std::string stripped = strip_indentation(raw_content);
Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING;
tokens.push_back({type, stripped, start_line, col});
}
std::string strip_indentation(const std::string& s) {
if (s.empty())
return s;
// Split into lines
std::vector<std::string> lines;
std::string current_line;
for (char c : s) {
if (c == '\n') {
lines.push_back(current_line);
current_line.clear();
} else {
current_line += c;
}
}
if (!current_line.empty() || (!s.empty() && s.back() == '\n')) {
lines.push_back(current_line);
}
// Find minimum indentation (spaces/tabs at start of non-empty lines)
// \x01 marker indicates preserved whitespace (from ''\ escape)
size_t min_indent = std::string::npos;
for (const auto& line : lines) {
if (line.empty())
continue; // Skip empty lines when calculating indentation
size_t indent = 0;
for (size_t i = 0; i < line.size(); i++) {
char c = line[i];
// If we hit the preserved whitespace marker, stop counting indentation
if (c == '\x01')
break;
if (c == ' ' || c == '\t')
indent++;
else
break;
}
if (indent < min_indent)
min_indent = indent;
}
if (min_indent == std::string::npos)
min_indent = 0;
// Strip min_indent from all lines and remove \x01 markers
std::string result;
for (size_t i = 0; i < lines.size(); i++) {
const auto& line = lines[i];
if (line.empty()) {
// Preserve empty lines
if (i + 1 < lines.size())
result += '\n';
} else {
// Strip indentation, being careful about \x01 markers
size_t skip = 0;
size_t pos = 0;
while (skip < min_indent && pos < line.size()) {
if (line[pos] == '\x01') {
// Hit preserved whitespace marker - don't strip any more
break;
}
skip++;
pos++;
}
// Add the rest of the line, removing \x01 markers
for (size_t j = pos; j < line.size(); j++) {
if (line[j] != '\x01') {
result += line[j];
}
}
if (i + 1 < lines.size())
result += '\n';
}
}
return result;
}
void tokenize_path() {
size_t start = pos;
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
std::string path = input.substr(start, pos - start);
tokens.push_back({Token::PATH, path, line, col});
col += path.size();
}
void tokenize_home_path() {
size_t start = pos;
pos++; // Skip ~
if (pos < input.size() && input[pos] == '/') {
// Home-relative path ~/something
while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' &&
input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
}
std::string path = input.substr(start, pos - start);
tokens.push_back({Token::PATH, path, line, col});
col += path.size();
}
void tokenize_int() {
size_t start = pos;
if (input[pos] == '-')
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
std::string num = input.substr(start, pos - start);
tokens.push_back({Token::INT, num, line, col});
col += num.size();
}
void tokenize_float() {
size_t start = pos;
if (input[pos] == '-')
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
if (pos < input.size() && input[pos] == '.') {
pos++;
while (pos < input.size() && isdigit(input[pos]))
pos++;
}
std::string num = input.substr(start, pos - start);
tokens.push_back({Token::FLOAT, num, line, col});
col += num.size();
}
void tokenize_uri() {
size_t start = pos;
while (pos < input.size() && !isspace(input[pos]) && input[pos] != ')' && input[pos] != ']' &&
input[pos] != ';') {
pos++;
}
std::string uri = input.substr(start, pos - start);
tokens.push_back({Token::URI, uri, line, col});
col += uri.size();
}
void tokenize_ident() {
size_t start = pos;
// Note: Don't include '.' here - it's used for selection (a.b.c)
// URIs are handled separately by checking for '://' pattern
while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
pos++;
std::string ident = input.substr(start, pos - start);
// Check if it's a URI (contains ://)
size_t scheme_end = ident.find("://");
if (scheme_end != std::string::npos && scheme_end > 0) {
tokens.push_back({Token::URI, ident, line, col});
col += ident.size();
return;
}
Token::Type type = Token::IDENT;
if (ident == "let")
type = Token::LET;
else if (ident == "in")
type = Token::IN;
else if (ident == "rec")
type = Token::REC;
else if (ident == "if")
type = Token::IF;
else if (ident == "then")
type = Token::THEN;
else if (ident == "else")
type = Token::ELSE;
else if (ident == "assert")
type = Token::ASSERT;
else if (ident == "with")
type = Token::WITH;
else if (ident == "inherit")
type = Token::INHERIT;
else if (ident == "import")
type = Token::IMPORT;
else if (ident == "true")
type = Token::BOOL;
else if (ident == "false")
type = Token::BOOL;
tokens.push_back({type, ident, line, col});
col += ident.size();
}
};
class Parser::Impl { class Parser::Impl {
public: public:
std::vector<Token> tokens; std::vector<Token> tokens;
@ -706,9 +85,9 @@ public:
bool expect(Token::Type type) { bool expect(Token::Type type) {
if (current().type != type) { if (current().type != type) {
std::cerr << "Expected token " << type << " but got " << current().type << " at " throw std::runtime_error(
<< current().line << ":" << current().col << "\n"; "Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) +
return false; " at " + std::to_string(current().line) + ":" + std::to_string(current().col));
} }
advance(); advance();
return true; return true;
@ -718,7 +97,7 @@ public:
int get_precedence(Token::Type type) { int get_precedence(Token::Type type) {
switch (type) { switch (type) {
case Token::MERGE: case Token::MERGE:
return 1; // Low precedence - binds loosely, but must be > 0 to be recognized as binary op return 1; // low precedence - binds loosely, but must be > 0 to be recognized as binary op
case Token::OR: case Token::OR:
return 1; return 1;
case Token::AND: case Token::AND:
@ -942,9 +321,10 @@ public:
} else if (current().type == Token::IDENT || current().type == Token::INT || } else if (current().type == Token::IDENT || current().type == Token::INT ||
current().type == Token::FLOAT || current().type == Token::BOOL || current().type == Token::FLOAT || current().type == Token::BOOL ||
current().type == Token::PATH || current().type == Token::LOOKUP_PATH || current().type == Token::PATH || current().type == Token::LOOKUP_PATH ||
current().type == Token::URI || current().type == Token::LBRACKET) { current().type == Token::URI || current().type == Token::LBRACKET ||
current().type == Token::LBRACE) {
// Juxtaposition application: f x // Juxtaposition application: f x
// Parse the argument as a primary expression (which handles lists, etc.) // Parse the argument as a primary expression (which handles lists, attrsets, etc.)
auto arg = parse_expr3(); auto arg = parse_expr3();
left = std::make_shared<Node>(AppNode(left, arg)); left = std::make_shared<Node>(AppNode(left, arg));
} else { } else {
@ -1056,9 +436,8 @@ public:
return std::make_shared<Node>(ConstBoolNode(t.value == "true")); return std::make_shared<Node>(ConstBoolNode(t.value == "true"));
} }
std::cerr << "Unknown token: " << t.value << " (type " << t.type << ")\n"; throw std::runtime_error("Unknown token: " + t.value + " (type " + std::to_string(t.type) +
advance(); ")");
return std::make_shared<Node>(ConstNullNode());
} }
std::shared_ptr<Node> parse_attrs() { std::shared_ptr<Node> parse_attrs() {