#include <cctype>
#include <cstddef>
#include <string>
#include <vector>

// NOTE(review): the pasted diff stripped every `<...>` span (include targets
// and template arguments), so the standard headers and `std::vector<Token>`
// signatures are reconstructed here.  The Token/Lexer declarations from
// lexer.h are merged into this unit so it is self-contained.

namespace nix_irc {

// A single lexical token of the Nix expression language, with the 1-based
// source position at which it started.
struct Token {
    enum Type {
        LPAREN,
        RPAREN,
        LBRACE,
        RBRACE,
        LBRACKET,
        RBRACKET,
        IDENT,
        STRING,
        STRING_INTERP,
        INDENTED_STRING,
        INDENTED_STRING_INTERP,
        PATH,
        LOOKUP_PATH,
        INT,
        FLOAT,
        URI,
        BOOL,
        LET,
        IN,
        REC,
        IF,
        THEN,
        ELSE,
        ASSERT,
        WITH,
        INHERIT,
        IMPORT,
        DOT,
        SEMICOLON,
        COLON,
        EQUALS,
        AT,
        COMMA,
        QUESTION,
        ELLIPSIS,
        // Operators
        PLUS,
        MINUS,
        STAR,
        SLASH,
        CONCAT,
        MERGE,
        EQEQ,
        NE,
        LT,
        GT,
        LE,
        GE,
        AND,
        OR,
        IMPL,
        NOT,
        EOF_
    } type;
    std::string value;
    size_t line;
    size_t col;
};

// Hand-written lexer for Nix expressions.
//
// NOTE(review): `input` is held by reference — the caller must keep the
// source string alive for the lifetime of the Lexer (a temporary argument
// would dangle).  Confirm all call sites pass an lvalue.
class Lexer {
public:
    Lexer(const std::string& input) : input(input), pos(0), line(1), col(1) {}

    // Tokenize the whole input.  Always terminates the stream with EOF_.
    // Unknown bytes are skipped silently (as in the original).
    std::vector<Token> tokenize() {
        // Token with no text at the current position.
        auto tok = [&](Token::Type t) { return Token{t, "", line, col}; };
        // Emit an n-character operator token and advance past it.
        auto op = [&](Token::Type t, size_t n) {
            tokens.push_back(tok(t));
            pos += n;
            col += n;
        };
        // True when the character `off` bytes ahead exists and equals `c`.
        auto peek = [&](size_t off, char c) {
            return pos + off < input.size() && input[pos + off] == c;
        };

        while (pos < input.size()) {
            skip_whitespace();
            if (pos >= input.size())
                break;

            char c = input[pos];

            // Punctuation.
            if (c == '(') {
                op(Token::LPAREN, 1);
            } else if (c == ')') {
                op(Token::RPAREN, 1);
            } else if (c == '{') {
                op(Token::LBRACE, 1);
            } else if (c == '}') {
                op(Token::RBRACE, 1);
            } else if (c == '[') {
                op(Token::LBRACKET, 1);
            } else if (c == ']') {
                op(Token::RBRACKET, 1);
            } else if (c == ';') {
                op(Token::SEMICOLON, 1);
            } else if (c == ':') {
                op(Token::COLON, 1);
            } else if (c == '@') {
                op(Token::AT, 1);
            } else if (c == ',') {
                op(Token::COMMA, 1);
            } else if (c == '\'' && peek(1, '\'')) {
                tokenize_indented_string();
            } else if (c == '"') {
                tokenize_string();
            }
            // Two-char operators (must be tried before their one-char prefixes).
            else if (c == '=' && peek(1, '=')) {
                op(Token::EQEQ, 2);
            } else if (c == '=') {
                op(Token::EQUALS, 1);
            } else if (c == '!' && peek(1, '=')) {
                op(Token::NE, 2);
            } else if (c == '<' && peek(1, '=')) {
                op(Token::LE, 2);
            } else if (c == '>' && peek(1, '=')) {
                op(Token::GE, 2);
            } else if (c == '+' && peek(1, '+')) {
                op(Token::CONCAT, 2);
            } else if (c == '/' && peek(1, '/')) {
                op(Token::MERGE, 2);
            } else if (c == '&' && peek(1, '&')) {
                op(Token::AND, 2);
            } else if (c == '|' && peek(1, '|')) {
                op(Token::OR, 2);
            } else if (c == '-' && peek(1, '>')) {
                op(Token::IMPL, 2);
            }
            // Single-char operators and literals.
            else if (c == '+') {
                op(Token::PLUS, 1);
            } else if (c == '*') {
                op(Token::STAR, 1);
            } else if (c == '/') {
                // `/x` or `/.` starts an absolute path; a lone `/` is division.
                if (pos + 1 < input.size() &&
                    (is_alnum(input[pos + 1]) || input[pos + 1] == '.')) {
                    tokenize_path();
                } else {
                    op(Token::SLASH, 1);
                }
            } else if (c == '<') {
                // <nixpkgs>-style lookup path: `<` + one or more path chars + `>`.
                // Otherwise it is the less-than operator.
                size_t end = pos + 1;
                while (end < input.size() &&
                       (is_alnum(input[end]) || input[end] == '-' ||
                        input[end] == '_' || input[end] == '/' || input[end] == '.')) {
                    end++;
                }
                if (end < input.size() && input[end] == '>' && end > pos + 1) {
                    tokens.push_back(
                        {Token::LOOKUP_PATH, input.substr(pos + 1, end - pos - 1), line, col});
                    // BUG FIX: the original assigned `pos = end + 1` *before*
                    // computing `col += (end - pos + 1)`, which therefore always
                    // added 0 and left column tracking stuck.  Advance col by the
                    // token width (including both angle brackets) first.
                    col += end + 1 - pos;
                    pos = end + 1;
                } else {
                    op(Token::LT, 1);
                }
            } else if (c == '>') {
                op(Token::GT, 1);
            } else if (c == '!') {
                op(Token::NOT, 1);
            } else if (c == '.') {
                if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') {
                    op(Token::ELLIPSIS, 3);
                } else {
                    op(Token::DOT, 1);
                }
            } else if (c == '?') {
                op(Token::QUESTION, 1);
            } else if (c == '~') {
                if (peek(1, '/')) {
                    tokenize_home_path();
                } else {
                    // BUG FIX: the original forwarded a lone `~` to
                    // tokenize_ident(), which consumes no characters, emitted an
                    // empty IDENT and never advanced `pos` — an infinite loop.
                    // Emit `~` itself as a one-character identifier and move on.
                    tokens.push_back({Token::IDENT, "~", line, col});
                    pos++;
                    col++;
                }
            } else if (c == '-' && pos + 1 < input.size() && is_digit(input[pos + 1])) {
                tokenize_number(); // negative int or float
            } else if (c == '-') {
                op(Token::MINUS, 1);
            } else if (is_digit(c)) {
                tokenize_number();
            } else if (is_alpha(c)) {
                // A leading scheme followed by "://" makes this a URI; anything
                // else is an identifier or keyword.
                size_t la = pos;
                while (la < input.size() &&
                       (is_alnum(input[la]) || input[la] == '_' || input[la] == '-' ||
                        input[la] == '+' || input[la] == '.')) {
                    la++;
                }
                if (la + 2 < input.size() && input[la] == ':' && input[la + 1] == '/' &&
                    input[la + 2] == '/') {
                    tokenize_uri();
                } else {
                    tokenize_ident();
                }
            } else {
                // Unknown byte: skip it (original behavior).
                pos++;
                col++;
            }
        }
        tokens.push_back({Token::EOF_, "", line, col});
        return tokens;
    }

private:
    std::vector<Token> tokens;
    const std::string& input;
    size_t pos;
    size_t line;
    size_t col;

    // <cctype> classifiers require a non-negative argument; plain char may be
    // negative, which is undefined behavior — cast through unsigned char.
    static bool is_digit(char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; }
    static bool is_alpha(char c) { return std::isalpha(static_cast<unsigned char>(c)) != 0; }
    static bool is_alnum(char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; }
    static bool is_space(char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; }

    // True when `c` terminates a path literal.
    static bool ends_path(char c) {
        return is_space(c) || c == '(' || c == ')' || c == '{' || c == '}' || c == '[' ||
               c == ']' || c == ';';
    }

    // Emit a token whose text is input[start..pos) and advance col by its width.
    void push_span(Token::Type t, size_t start) {
        std::string text = input.substr(start, pos - start);
        tokens.push_back({t, text, line, col});
        col += text.size();
    }

    // Skip spaces, tabs, CR/LF, '#' line comments and /* */ block comments,
    // maintaining the line/col counters.  Nix block comments do NOT nest.
    void skip_whitespace() {
        while (pos < input.size()) {
            char c = input[pos];
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                if (c == '\n') {
                    line++;
                    col = 1;
                } else {
                    col++;
                }
                pos++;
            } else if (c == '#') {
                // Line comment: consume up to (not including) the newline.
                while (pos < input.size() && input[pos] != '\n') {
                    pos++;
                    col++;
                }
            } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') {
                pos += 2; // skip /*
                col += 2;
                while (pos + 1 < input.size()) {
                    if (input[pos] == '*' && input[pos + 1] == '/') {
                        pos += 2; // skip */
                        col += 2;
                        break;
                    }
                    if (input[pos] == '\n') {
                        line++;
                        col = 1;
                    } else {
                        col++;
                    }
                    pos++;
                }
            } else {
                break;
            }
        }
    }

    // Double-quoted string with backslash escapes.  A "${" marks the string as
    // interpolated; the raw "$"/"{" text is kept for the parser to re-scan.
    void tokenize_string() {
        pos++; // opening quote
        std::string s;
        bool has_interp = false;

        while (pos < input.size() && input[pos] != '"') {
            if (input[pos] == '\\' && pos + 1 < input.size()) {
                pos++;
                switch (input[pos]) {
                case 'n':
                    s += '\n';
                    break;
                case 't':
                    s += '\t';
                    break;
                case 'r':
                    s += '\r';
                    break;
                default:
                    // Covers \" \\ \$ and any unrecognized escape: keep the
                    // escaped character literally (original behavior).
                    s += input[pos];
                    break;
                }
                pos++;
            } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') {
                has_interp = true;
                s += input[pos]; // keep '$'; '{' is appended next iteration
                pos++;
            } else {
                s += input[pos];
                pos++;
            }
        }
        if (pos < input.size())
            pos++; // closing quote (absent if the string is unterminated)

        tokens.push_back({has_interp ? Token::STRING_INTERP : Token::STRING, s, line, col});
        // NOTE(review): col advances by the *unescaped* length plus quotes, and
        // `line` is not updated for embedded newlines, so positions after a
        // multi-line or escape-heavy string are approximate (as in the original).
        col += s.size() + 2;
    }

    // ''-delimited indented string.  Handles the ''' / ''$ / ''\X escapes and
    // defers indentation stripping to strip_indentation().
    void tokenize_indented_string() {
        pos += 2; // opening ''
        std::string raw;
        bool has_interp = false;
        size_t start_line = line;

        while (pos < input.size()) {
            if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') {
                if (pos + 2 < input.size() && input[pos + 2] == '\'') {
                    raw += "''"; // ''' escapes ''
                    pos += 3;
                    continue;
                }
                if (pos + 2 < input.size() && input[pos + 2] == '$') {
                    raw += '$'; // ''$ escapes $
                    pos += 3;
                    continue;
                }
                if (pos + 2 < input.size() && input[pos + 2] == '\\') {
                    // ''\X escapes.
                    if (pos + 3 < input.size()) {
                        char next = input[pos + 3];
                        if (next == 'n') {
                            raw += '\n';
                            pos += 4;
                            continue;
                        }
                        if (next == 'r') {
                            raw += '\r';
                            pos += 4;
                            continue;
                        }
                        if (next == 't') {
                            raw += '\t';
                            pos += 4;
                            continue;
                        }
                        if (next == ' ' || next == '\t') {
                            // Preserved whitespace: tag it with a \x1F\x1F pair so
                            // strip_indentation() will not count or strip it.
                            raw += "\x1F\x1F";
                            raw += next;
                            pos += 4;
                            continue;
                        }
                    }
                    raw += '\\'; // otherwise: literal backslash
                    pos += 3;
                    continue;
                }
                pos += 2; // closing ''
                break;
            }

            // "${" marks interpolation; the raw text is kept for the parser.
            // (The original also re-tested input[pos] for '\n' right after
            // consuming the '$' — provably dead, since that byte is '{'.)
            if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{')
                has_interp = true;
            if (input[pos] == '\n')
                line++;
            raw += input[pos];
            pos++;
        }

        std::string stripped = strip_indentation(raw);
        tokens.push_back({has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING,
                          stripped, start_line, col});
    }

    // Remove the common leading indentation (spaces/tabs) of all non-empty
    // lines of an indented-string body, and elide the \x1F\x1F markers that
    // protect whitespace preserved by the ''\  escape.
    std::string strip_indentation(const std::string& s) {
        if (s.empty())
            return s;

        // Split into lines; a trailing '\n' contributes a final empty line.
        std::vector<std::string> rows;
        std::string cur;
        for (char c : s) {
            if (c == '\n') {
                rows.push_back(cur);
                cur.clear();
            } else {
                cur += c;
            }
        }
        if (!cur.empty() || s.back() == '\n')
            rows.push_back(cur);

        // Minimum indentation over non-empty rows; a marker ends the indent run.
        size_t min_indent = std::string::npos;
        for (const auto& row : rows) {
            if (row.empty())
                continue; // empty lines don't constrain indentation
            size_t indent = 0;
            for (size_t i = 0; i < row.size(); i++) {
                if (row[i] == '\x1F' && i + 1 < row.size() && row[i + 1] == '\x1F')
                    break;
                if (row[i] != ' ' && row[i] != '\t')
                    break;
                indent++;
            }
            if (indent < min_indent)
                min_indent = indent;
        }
        if (min_indent == std::string::npos)
            min_indent = 0; // all lines empty

        // Strip the common indent and drop marker pairs.
        std::string out;
        for (size_t i = 0; i < rows.size(); i++) {
            const std::string& row = rows[i];
            if (!row.empty()) {
                size_t p = 0;
                while (p < min_indent && p < row.size()) {
                    if (row[p] == '\x1F' && p + 1 < row.size() && row[p + 1] == '\x1F')
                        break; // preserved whitespace: stop stripping
                    p++;
                }
                for (size_t j = p; j < row.size(); j++) {
                    if (row[j] == '\x1F' && j + 1 < row.size() && row[j + 1] == '\x1F') {
                        j++; // skip both marker bytes
                        continue;
                    }
                    out += row[j];
                }
            }
            if (i + 1 < rows.size())
                out += '\n';
        }
        return out;
    }

    // Absolute/relative path starting at a '/'.
    void tokenize_path() {
        size_t start = pos;
        while (pos < input.size() && !ends_path(input[pos]))
            pos++;
        push_span(Token::PATH, start);
    }

    // Home-relative path: caller guarantees input[pos] == '~', input[pos+1] == '/'.
    void tokenize_home_path() {
        size_t start = pos;
        pos++; // '~'
        while (pos < input.size() && !ends_path(input[pos]))
            pos++;
        push_span(Token::PATH, start);
    }

    // Integer or float literal, optionally negative.
    // BUG FIX: the original chose FLOAT only when the *second* character was
    // '.', so "1.5" lexed as FLOAT but "12.5" as INT DOT INT.  Here the
    // fractional part is detected after all integer digits, and only when a
    // digit follows the '.' — selection like `a.b` is unaffected.
    void tokenize_number() {
        size_t start = pos;
        if (input[pos] == '-')
            pos++;
        while (pos < input.size() && is_digit(input[pos]))
            pos++;
        bool is_float = false;
        if (pos + 1 < input.size() && input[pos] == '.' && is_digit(input[pos + 1])) {
            is_float = true;
            pos++; // '.'
            while (pos < input.size() && is_digit(input[pos]))
                pos++;
        }
        push_span(is_float ? Token::FLOAT : Token::INT, start);
    }

    // URI literal (the caller has already verified the "scheme://" prefix).
    void tokenize_uri() {
        size_t start = pos;
        while (pos < input.size() && !is_space(input[pos]) && input[pos] != ')' &&
               input[pos] != ']' && input[pos] != ';')
            pos++;
        push_span(Token::URI, start);
    }

    // Identifier or keyword.  '.' is deliberately excluded: it is the
    // selection operator (a.b.c).  URIs never reach here — ':' is not an
    // identifier character, so the original's post-hoc "://" re-check was
    // unreachable and has been removed.
    void tokenize_ident() {
        size_t start = pos;
        while (pos < input.size() &&
               (is_alnum(input[pos]) || input[pos] == '_' || input[pos] == '-'))
            pos++;
        std::string ident = input.substr(start, pos - start);
        tokens.push_back({keyword_type(ident), ident, line, col});
        col += ident.size();
    }

    // Map reserved words to their token types; everything else is IDENT.
    static Token::Type keyword_type(const std::string& ident) {
        if (ident == "let")
            return Token::LET;
        if (ident == "in")
            return Token::IN;
        if (ident == "rec")
            return Token::REC;
        if (ident == "if")
            return Token::IF;
        if (ident == "then")
            return Token::THEN;
        if (ident == "else")
            return Token::ELSE;
        if (ident == "assert")
            return Token::ASSERT;
        if (ident == "with")
            return Token::WITH;
        if (ident == "inherit")
            return Token::INHERIT;
        if (ident == "import")
            return Token::IMPORT;
        if (ident == "true" || ident == "false")
            return Token::BOOL;
        return Token::IDENT;
    }
};

} // namespace nix_irc
input.size()) { - skip_whitespace(); - if (pos >= input.size()) - break; - - char c = input[pos]; - - if (c == '(') { - emit(TOKEN(LPAREN)); - } else if (c == ')') { - emit(TOKEN(RPAREN)); - } else if (c == '{') { - emit(TOKEN(LBRACE)); - } else if (c == '}') { - emit(TOKEN(RBRACE)); - } else if (c == '[') { - emit(TOKEN(LBRACKET)); - } else if (c == ']') { - emit(TOKEN(RBRACKET)); - } else if (c == ';') { - emit(TOKEN(SEMICOLON)); - } else if (c == ':') { - emit(TOKEN(COLON)); - } else if (c == '@') { - emit(TOKEN(AT)); - } else if (c == ',') { - emit(TOKEN(COMMA)); - } else if (c == '\'' && pos + 1 < input.size() && input[pos + 1] == '\'') { - tokenize_indented_string(); - } else if (c == '"') { - tokenize_string(); - } - // Two-char operators - else if (c == '=' && pos + 1 < input.size() && input[pos + 1] == '=') { - tokens.push_back(TOKEN(EQEQ)); - pos += 2; - col += 2; - } else if (c == '=') { - emit(TOKEN(EQUALS)); - } else if (c == '!' && pos + 1 < input.size() && input[pos + 1] == '=') { - tokens.push_back(TOKEN(NE)); - pos += 2; - col += 2; - } else if (c == '<' && pos + 1 < input.size() && input[pos + 1] == '=') { - tokens.push_back(TOKEN(LE)); - pos += 2; - col += 2; - } else if (c == '>' && pos + 1 < input.size() && input[pos + 1] == '=') { - tokens.push_back(TOKEN(GE)); - pos += 2; - col += 2; - } else if (c == '+' && pos + 1 < input.size() && input[pos + 1] == '+') { - tokens.push_back(TOKEN(CONCAT)); - pos += 2; - col += 2; - } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '/') { - tokens.push_back(TOKEN(MERGE)); - pos += 2; - col += 2; - } else if (c == '&' && pos + 1 < input.size() && input[pos + 1] == '&') { - tokens.push_back(TOKEN(AND)); - pos += 2; - col += 2; - } else if (c == '|' && pos + 1 < input.size() && input[pos + 1] == '|') { - tokens.push_back(TOKEN(OR)); - pos += 2; - col += 2; - } else if (c == '-' && pos + 1 < input.size() && input[pos + 1] == '>') { - tokens.push_back(TOKEN(IMPL)); - pos += 2; - col += 2; - } - 
// Single-char operators - else if (c == '+') { - emit(TOKEN(PLUS)); - } else if (c == '*') { - emit(TOKEN(STAR)); - } else if (c == '/') { - // Check if it's a path or division - if (pos + 1 < input.size() && (isalnum(input[pos + 1]) || input[pos + 1] == '.')) { - tokenize_path(); - } else { - emit(TOKEN(SLASH)); - } - } else if (c == '<') { - // Check for lookup path vs comparison operator - size_t end = pos + 1; - bool is_lookup_path = false; - - // Scan for valid lookup path characters until > - while (end < input.size() && - (isalnum(input[end]) || input[end] == '-' || input[end] == '_' || - input[end] == '/' || input[end] == '.')) { - end++; - } - - // If we found > and there's content, it's a lookup path - if (end < input.size() && input[end] == '>' && end > pos + 1) { - std::string path = input.substr(pos + 1, end - pos - 1); - tokens.push_back({Token::LOOKUP_PATH, path, line, col}); - pos = end + 1; - col += (end - pos + 1); - is_lookup_path = true; - } - - if (!is_lookup_path) { - emit(TOKEN(LT)); - } - } else if (c == '>') { - emit(TOKEN(GT)); - } else if (c == '!') { - emit(TOKEN(NOT)); - } else if (c == '.') { - // Check for ellipsis (...) - if (pos + 2 < input.size() && input[pos + 1] == '.' && input[pos + 2] == '.') { - tokens.push_back(TOKEN(ELLIPSIS)); - pos += 3; - col += 3; - } else { - emit(TOKEN(DOT)); - } - } else if (c == '?') { - emit(TOKEN(QUESTION)); - } else if (c == '~') { - // Home-relative path ~/... 
- if (pos + 1 < input.size() && input[pos + 1] == '/') { - tokenize_home_path(); - } else { - // Just ~ by itself is an identifier - tokenize_ident(); - } - } else if (c == '-') { - // Check if it's a negative number or minus operator - if (pos + 1 < input.size() && isdigit(input[pos + 1])) { - // Check for negative float - if (pos + 2 < input.size() && input[pos + 2] == '.') { - tokenize_float(); - } else { - tokenize_int(); - } - } else { - emit(TOKEN(MINUS)); - } - } else if (isdigit(c)) { - // Check if it's a float (digit followed by '.') - if (pos + 1 < input.size() && input[pos + 1] == '.') { - tokenize_float(); - } else { - tokenize_int(); - } - } else if (isalpha(c)) { - // Check if it's a URI (contains ://) - look ahead - size_t lookahead = pos; - while (lookahead < input.size() && - (isalnum(input[lookahead]) || input[lookahead] == '_' || input[lookahead] == '-' || - input[lookahead] == '+' || input[lookahead] == '.')) - lookahead++; - std::string potential_scheme = input.substr(pos, lookahead - pos); - if (lookahead + 2 < input.size() && input[lookahead] == ':' && - input[lookahead + 1] == '/' && input[lookahead + 2] == '/') { - // It's a URI, consume the whole thing - tokenize_uri(); - } else { - tokenize_ident(); - } - } else { - pos++; - col++; - } - } - tokens.push_back({Token::EOF_, "", line, col}); - -#undef TOKEN - return tokens; - } - -private: - std::vector tokens; - const std::string& input; - size_t pos; - size_t line; - size_t col; - - void emit(const Token& t) { - tokens.push_back(t); - pos++; - col++; - } - - void skip_whitespace() { - while (pos < input.size()) { - char c = input[pos]; - if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { - if (c == '\n') { - line++; - col = 1; - } else { - col++; - } - pos++; - } else if (c == '#') { - // Line comment - skip until newline - while (pos < input.size() && input[pos] != '\n') - pos++; - } else if (c == '/' && pos + 1 < input.size() && input[pos + 1] == '*') { - // Block comment /* ... 
*/ - // Note: Nix block comments do NOT nest - pos += 2; // Skip /* - while (pos + 1 < input.size()) { - if (input[pos] == '*' && input[pos + 1] == '/') { - pos += 2; // Skip */ - break; - } - if (input[pos] == '\n') { - line++; - col = 1; - } else { - col++; - } - pos++; - } - } else { - break; - } - } - } - - void tokenize_string() { - pos++; - std::string s; - bool has_interp = false; - - while (pos < input.size() && input[pos] != '"') { - if (input[pos] == '\\' && pos + 1 < input.size()) { - pos++; - switch (input[pos]) { - case 'n': - s += '\n'; - break; - case 't': - s += '\t'; - break; - case 'r': - s += '\r'; - break; - case '"': - s += '"'; - break; - case '\\': - s += '\\'; - break; - case '$': - s += '$'; - break; // Escaped $ - default: - s += input[pos]; - break; - } - pos++; - } else if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') { - // Found interpolation marker - has_interp = true; - s += input[pos]; // Keep $ in raw string - pos++; - } else { - s += input[pos]; - pos++; - } - } - pos++; - - Token::Type type = has_interp ? 
Token::STRING_INTERP : Token::STRING; - tokens.push_back({type, s, line, col}); - col += s.size() + 2; - } - - void tokenize_indented_string() { - pos += 2; // Skip opening '' - std::string raw_content; - bool has_interp = false; - size_t start_line = line; - - // Collect raw content until closing '' - while (pos < input.size()) { - // Check for escape sequences - if (pos + 1 < input.size() && input[pos] == '\'' && input[pos + 1] == '\'') { - // Check if it's an escape or the closing delimiter - if (pos + 2 < input.size() && input[pos + 2] == '\'') { - // ''' -> escape for '' - raw_content += "''"; - pos += 3; - continue; - } else if (pos + 2 < input.size() && input[pos + 2] == '$') { - // ''$ -> escape for $ - raw_content += '$'; - pos += 3; - continue; - } else if (pos + 2 < input.size() && input[pos + 2] == '\\') { - // ''\ -> check what follows - if (pos + 3 < input.size()) { - char next = input[pos + 3]; - if (next == 'n') { - raw_content += '\n'; - pos += 4; - continue; - } else if (next == 'r') { - raw_content += '\r'; - pos += 4; - continue; - } else if (next == 't') { - raw_content += '\t'; - pos += 4; - continue; - } else if (next == ' ' || next == '\t') { - // ''\ before whitespace - preserve the whitespace (mark it specially) - raw_content += "\x01"; // Use control char as marker for preserved whitespace - raw_content += next; - pos += 4; - continue; - } - } - // Default: literal backslash - raw_content += '\\'; - pos += 3; - continue; - } else { - // Just closing '' - pos += 2; - break; - } - } - - // Check for interpolation - if (input[pos] == '$' && pos + 1 < input.size() && input[pos + 1] == '{') { - has_interp = true; - raw_content += input[pos]; - pos++; - if (input[pos] == '\n') { - line++; - } - continue; - } - - // Track newlines - if (input[pos] == '\n') { - line++; - raw_content += input[pos]; - pos++; - } else { - raw_content += input[pos]; - pos++; - } - } - - // Strip common indentation - std::string stripped = 
strip_indentation(raw_content); - - Token::Type type = has_interp ? Token::INDENTED_STRING_INTERP : Token::INDENTED_STRING; - tokens.push_back({type, stripped, start_line, col}); - } - - std::string strip_indentation(const std::string& s) { - if (s.empty()) - return s; - - // Split into lines - std::vector lines; - std::string current_line; - for (char c : s) { - if (c == '\n') { - lines.push_back(current_line); - current_line.clear(); - } else { - current_line += c; - } - } - if (!current_line.empty() || (!s.empty() && s.back() == '\n')) { - lines.push_back(current_line); - } - - // Find minimum indentation (spaces/tabs at start of non-empty lines) - // \x01 marker indicates preserved whitespace (from ''\ escape) - size_t min_indent = std::string::npos; - for (const auto& line : lines) { - if (line.empty()) - continue; // Skip empty lines when calculating indentation - size_t indent = 0; - for (size_t i = 0; i < line.size(); i++) { - char c = line[i]; - // If we hit the preserved whitespace marker, stop counting indentation - if (c == '\x01') - break; - if (c == ' ' || c == '\t') - indent++; - else - break; - } - if (indent < min_indent) - min_indent = indent; - } - - if (min_indent == std::string::npos) - min_indent = 0; - - // Strip min_indent from all lines and remove \x01 markers - std::string result; - for (size_t i = 0; i < lines.size(); i++) { - const auto& line = lines[i]; - if (line.empty()) { - // Preserve empty lines - if (i + 1 < lines.size()) - result += '\n'; - } else { - // Strip indentation, being careful about \x01 markers - size_t skip = 0; - size_t pos = 0; - while (skip < min_indent && pos < line.size()) { - if (line[pos] == '\x01') { - // Hit preserved whitespace marker - don't strip any more - break; - } - skip++; - pos++; - } - - // Add the rest of the line, removing \x01 markers - for (size_t j = pos; j < line.size(); j++) { - if (line[j] != '\x01') { - result += line[j]; - } - } - - if (i + 1 < lines.size()) - result += '\n'; - } - } - - 
return result; - } - - void tokenize_path() { - size_t start = pos; - while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' && - input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' && - input[pos] != ';') { - pos++; - } - std::string path = input.substr(start, pos - start); - tokens.push_back({Token::PATH, path, line, col}); - col += path.size(); - } - - void tokenize_home_path() { - size_t start = pos; - pos++; // Skip ~ - if (pos < input.size() && input[pos] == '/') { - // Home-relative path ~/something - while (pos < input.size() && !isspace(input[pos]) && input[pos] != '(' && input[pos] != ')' && - input[pos] != '{' && input[pos] != '}' && input[pos] != '[' && input[pos] != ']' && - input[pos] != ';') { - pos++; - } - } - std::string path = input.substr(start, pos - start); - tokens.push_back({Token::PATH, path, line, col}); - col += path.size(); - } - - void tokenize_int() { - size_t start = pos; - if (input[pos] == '-') - pos++; - while (pos < input.size() && isdigit(input[pos])) - pos++; - std::string num = input.substr(start, pos - start); - tokens.push_back({Token::INT, num, line, col}); - col += num.size(); - } - - void tokenize_float() { - size_t start = pos; - if (input[pos] == '-') - pos++; - while (pos < input.size() && isdigit(input[pos])) - pos++; - if (pos < input.size() && input[pos] == '.') { - pos++; - while (pos < input.size() && isdigit(input[pos])) - pos++; - } - std::string num = input.substr(start, pos - start); - tokens.push_back({Token::FLOAT, num, line, col}); - col += num.size(); - } - - void tokenize_uri() { - size_t start = pos; - while (pos < input.size() && !isspace(input[pos]) && input[pos] != ')' && input[pos] != ']' && - input[pos] != ';') { - pos++; - } - std::string uri = input.substr(start, pos - start); - tokens.push_back({Token::URI, uri, line, col}); - col += uri.size(); - } - - void tokenize_ident() { - size_t start = pos; - // Note: Don't include '.' 
here - it's used for selection (a.b.c) - // URIs are handled separately by checking for '://' pattern - while (pos < input.size() && (isalnum(input[pos]) || input[pos] == '_' || input[pos] == '-')) - pos++; - std::string ident = input.substr(start, pos - start); - - // Check if it's a URI (contains ://) - size_t scheme_end = ident.find("://"); - if (scheme_end != std::string::npos && scheme_end > 0) { - tokens.push_back({Token::URI, ident, line, col}); - col += ident.size(); - return; - } - - Token::Type type = Token::IDENT; - if (ident == "let") - type = Token::LET; - else if (ident == "in") - type = Token::IN; - else if (ident == "rec") - type = Token::REC; - else if (ident == "if") - type = Token::IF; - else if (ident == "then") - type = Token::THEN; - else if (ident == "else") - type = Token::ELSE; - else if (ident == "assert") - type = Token::ASSERT; - else if (ident == "with") - type = Token::WITH; - else if (ident == "inherit") - type = Token::INHERIT; - else if (ident == "import") - type = Token::IMPORT; - else if (ident == "true") - type = Token::BOOL; - else if (ident == "false") - type = Token::BOOL; - - tokens.push_back({type, ident, line, col}); - col += ident.size(); - } -}; - class Parser::Impl { public: std::vector tokens; @@ -706,9 +85,9 @@ public: bool expect(Token::Type type) { if (current().type != type) { - std::cerr << "Expected token " << type << " but got " << current().type << " at " - << current().line << ":" << current().col << "\n"; - return false; + throw std::runtime_error( + "Expected token " + std::to_string(type) + " but got " + std::to_string(current().type) + + " at " + std::to_string(current().line) + ":" + std::to_string(current().col)); } advance(); return true; @@ -718,7 +97,7 @@ public: int get_precedence(Token::Type type) { switch (type) { case Token::MERGE: - return 1; // Low precedence - binds loosely, but must be > 0 to be recognized as binary op + return 1; // low precedence - binds loosely, but must be > 0 to be 
recognized as binary op case Token::OR: return 1; case Token::AND: @@ -942,9 +321,10 @@ public: } else if (current().type == Token::IDENT || current().type == Token::INT || current().type == Token::FLOAT || current().type == Token::BOOL || current().type == Token::PATH || current().type == Token::LOOKUP_PATH || - current().type == Token::URI || current().type == Token::LBRACKET) { + current().type == Token::URI || current().type == Token::LBRACKET || + current().type == Token::LBRACE) { // Juxtaposition application: f x - // Parse the argument as a primary expression (which handles lists, etc.) + // Parse the argument as a primary expression (which handles lists, attrsets, etc.) auto arg = parse_expr3(); left = std::make_shared(AppNode(left, arg)); } else { @@ -1056,9 +436,8 @@ public: return std::make_shared(ConstBoolNode(t.value == "true")); } - std::cerr << "Unknown token: " << t.value << " (type " << t.type << ")\n"; - advance(); - return std::make_shared(ConstNullNode()); + throw std::runtime_error("Unknown token: " + t.value + " (type " + std::to_string(t.type) + + ")"); } std::shared_ptr parse_attrs() {