Remove the command-parsing Reader and just track a ParseState
The Reader abstraction led to suboptimal code by encouraging decoding UTF-8 separately from advancing to the next codepoint.
This commit is contained in:
parent
86c5e8f75b
commit
2289f350df
|
@ -87,31 +87,6 @@ struct parse_error : runtime_error
|
||||||
: runtime_error{format("parse error: {}", error)} {}
|
: runtime_error{format("parse error: {}", error)} {}
|
||||||
};
|
};
|
||||||
|
|
||||||
Codepoint Reader::operator*() const
|
|
||||||
{
|
|
||||||
kak_assert(pos < str.end());
|
|
||||||
return utf8::codepoint(pos, str.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
Codepoint Reader::peek_next() const
|
|
||||||
{
|
|
||||||
return utf8::codepoint(utf8::next(pos, str.end()), str.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
Reader& Reader::operator++()
|
|
||||||
{
|
|
||||||
kak_assert(pos < str.end());
|
|
||||||
utf8::to_next(pos, str.end());
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
Reader& Reader::next_byte()
|
|
||||||
{
|
|
||||||
kak_assert(pos < str.end());
|
|
||||||
++pos;
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -120,84 +95,86 @@ bool is_command_separator(Codepoint c)
|
||||||
return c == ';' or c == '\n';
|
return c == ';' or c == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
struct QuotedResult
|
struct ParseResult
|
||||||
{
|
{
|
||||||
String content;
|
String content;
|
||||||
bool terminated;
|
bool terminated;
|
||||||
};
|
};
|
||||||
|
|
||||||
QuotedResult parse_quoted(Reader& reader, Codepoint delimiter)
|
ParseResult parse_quoted(ParseState& state, Codepoint delimiter)
|
||||||
{
|
{
|
||||||
auto beg = reader.pos;
|
const char* beg = state.pos;
|
||||||
|
const char* end = state.str.end();
|
||||||
String str;
|
String str;
|
||||||
|
|
||||||
while (reader)
|
while (state.pos != end)
|
||||||
{
|
{
|
||||||
const Codepoint c = *reader;
|
const char* cur = state.pos;
|
||||||
|
const Codepoint c = utf8::read_codepoint(state.pos, end);
|
||||||
if (c == delimiter)
|
if (c == delimiter)
|
||||||
{
|
{
|
||||||
if (reader.peek_next() != delimiter)
|
auto next = state.pos;
|
||||||
|
if (utf8::read_codepoint(next, end) != delimiter)
|
||||||
{
|
{
|
||||||
str += reader.substr_from(beg);
|
if (str.empty())
|
||||||
++reader;
|
return {String{String::NoCopy{}, {beg, cur}}, true};
|
||||||
|
|
||||||
|
str += StringView{beg, cur};
|
||||||
return {str, true};
|
return {str, true};
|
||||||
}
|
}
|
||||||
str += (++reader).substr_from(beg);
|
str += StringView{beg, state.pos};
|
||||||
beg = reader.pos+1;
|
state.pos = beg = next;
|
||||||
}
|
}
|
||||||
++reader;
|
|
||||||
}
|
}
|
||||||
if (beg < reader.str.end())
|
if (beg < end)
|
||||||
str += reader.substr_from(beg);
|
str += StringView{beg, end};
|
||||||
return {str, false};
|
return {str, false};
|
||||||
}
|
}
|
||||||
|
|
||||||
QuotedResult parse_quoted_balanced(Reader& reader, char opening_delimiter,
|
ParseResult parse_quoted_balanced(ParseState& state, char opening_delimiter, char closing_delimiter)
|
||||||
char closing_delimiter)
|
|
||||||
{
|
{
|
||||||
kak_assert(utf8::codepoint(utf8::previous(reader.pos, reader.str.begin()),
|
int level = 1;
|
||||||
reader.str.end()) == opening_delimiter);
|
const char* pos = state.pos;
|
||||||
int level = 0;
|
const char* beg = pos;
|
||||||
auto start = reader.pos;
|
const char* end = state.str.end();
|
||||||
while (reader)
|
while (pos != end)
|
||||||
{
|
{
|
||||||
const char c = *reader.pos;
|
const char c = *pos++;
|
||||||
if (c == opening_delimiter)
|
if (c == opening_delimiter)
|
||||||
++level;
|
++level;
|
||||||
else if (c == closing_delimiter and level-- == 0)
|
else if (c == closing_delimiter and --level == 0)
|
||||||
{
|
break;
|
||||||
auto content = reader.substr_from(start);
|
|
||||||
reader.next_byte();
|
|
||||||
return {String{String::NoCopy{}, content}, true};
|
|
||||||
}
|
|
||||||
reader.next_byte();
|
|
||||||
}
|
}
|
||||||
return {String{String::NoCopy{}, reader.substr_from(start)}, false};
|
state.pos = pos;
|
||||||
|
const bool terminated = (level == 0);
|
||||||
|
return {String{String::NoCopy{}, {beg, pos - terminated}}, terminated};
|
||||||
}
|
}
|
||||||
|
|
||||||
String parse_unquoted(Reader& reader)
|
String parse_unquoted(ParseState& state)
|
||||||
{
|
{
|
||||||
auto beg = reader.pos;
|
const char* beg = state.pos;
|
||||||
|
const char* end = state.str.end();
|
||||||
|
|
||||||
String str;
|
String str;
|
||||||
|
|
||||||
while (reader)
|
while (state.pos != end)
|
||||||
{
|
{
|
||||||
const char c = *reader.pos;
|
const char c = *state.pos;
|
||||||
if (is_command_separator(c) or is_horizontal_blank(c))
|
if (is_command_separator(c) or is_horizontal_blank(c))
|
||||||
{
|
{
|
||||||
str += reader.substr_from(beg);
|
str += StringView{beg, state.pos};
|
||||||
if (reader.pos != reader.str.begin() and *(reader.pos - 1) == '\\')
|
if (state.pos != beg and *(state.pos - 1) == '\\')
|
||||||
{
|
{
|
||||||
str.back() = c;
|
str.back() = c;
|
||||||
beg = reader.pos+1;
|
beg = state.pos+1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
reader.next_byte();
|
++state.pos;
|
||||||
}
|
}
|
||||||
if (beg < reader.str.end())
|
if (beg < end)
|
||||||
str += reader.substr_from(beg);
|
str += StringView{beg, end};
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,20 +200,20 @@ Token::Type token_type(StringView type_name, bool throw_on_invalid)
|
||||||
return Token::Type::RawQuoted;
|
return Token::Type::RawQuoted;
|
||||||
}
|
}
|
||||||
|
|
||||||
void skip_blanks_and_comments(Reader& reader)
|
void skip_blanks_and_comments(ParseState& state)
|
||||||
{
|
{
|
||||||
while (reader)
|
while (state)
|
||||||
{
|
{
|
||||||
const Codepoint c = *reader.pos;
|
const Codepoint c = *state.pos;
|
||||||
if (is_horizontal_blank(c))
|
if (is_horizontal_blank(c))
|
||||||
reader.next_byte();
|
++state.pos;
|
||||||
else if (c == '\\' and reader.pos + 1 != reader.str.end() and
|
else if (c == '\\' and state.pos + 1 != state.str.end() and
|
||||||
*(reader.pos + 1) == '\n')
|
state.pos[1] == '\n')
|
||||||
reader.next_byte().next_byte();
|
state.pos += 2;
|
||||||
else if (c == '#')
|
else if (c == '#')
|
||||||
{
|
{
|
||||||
while (reader and *reader != '\n')
|
while (state and *state.pos != '\n')
|
||||||
reader.next_byte();
|
++state.pos;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
|
@ -259,17 +236,16 @@ BufferCoord compute_coord(StringView s)
|
||||||
return coord;
|
return coord;
|
||||||
}
|
}
|
||||||
|
|
||||||
Token parse_percent_token(Reader& reader, bool throw_on_unterminated)
|
Token parse_percent_token(ParseState& state, bool throw_on_unterminated)
|
||||||
{
|
{
|
||||||
kak_assert(*reader == '%');
|
kak_assert(state.pos[-1] == '%');
|
||||||
++reader;
|
const auto type_start = state.pos;
|
||||||
|
while (state and *state.pos >= 'a' and *state.pos <= 'z')
|
||||||
|
++state.pos;
|
||||||
|
StringView type_name{type_start, state.pos};
|
||||||
|
|
||||||
const auto type_start = reader.pos;
|
const Codepoint opening_delimiter = utf8::read_codepoint(state.pos, state.str.end());
|
||||||
while (reader and iswalpha(*reader))
|
if (not state or iswalpha(opening_delimiter))
|
||||||
++reader;
|
|
||||||
StringView type_name = reader.substr_from(type_start);
|
|
||||||
|
|
||||||
if (not reader or is_blank(*reader))
|
|
||||||
{
|
{
|
||||||
if (throw_on_unterminated)
|
if (throw_on_unterminated)
|
||||||
throw parse_error{format("expected a string delimiter after '%{}'",
|
throw parse_error{format("expected a string delimiter after '%{}'",
|
||||||
|
@ -279,45 +255,41 @@ Token parse_percent_token(Reader& reader, bool throw_on_unterminated)
|
||||||
|
|
||||||
Token::Type type = token_type(type_name, throw_on_unterminated);
|
Token::Type type = token_type(type_name, throw_on_unterminated);
|
||||||
|
|
||||||
constexpr struct CharPair { Codepoint opening; Codepoint closing; } matching_pairs[] = {
|
constexpr struct CharPair { char opening; char closing; } matching_pairs[] = {
|
||||||
{ '(', ')' }, { '[', ']' }, { '{', '}' }, { '<', '>' }
|
{ '(', ')' }, { '[', ']' }, { '{', '}' }, { '<', '>' }
|
||||||
};
|
};
|
||||||
|
|
||||||
const Codepoint opening_delimiter = *reader;
|
auto start = state.pos;
|
||||||
++reader;
|
const ByteCount byte_pos = start - state.str.begin();
|
||||||
auto start = reader.pos;
|
|
||||||
|
|
||||||
auto it = find_if(matching_pairs, [opening_delimiter](const CharPair& cp)
|
if (auto it = find_if(matching_pairs, [=](const CharPair& cp) { return opening_delimiter == cp.opening; });
|
||||||
{ return opening_delimiter == cp.opening; });
|
it != std::end(matching_pairs))
|
||||||
|
|
||||||
const auto str_beg = reader.str.begin();
|
|
||||||
if (it != std::end(matching_pairs))
|
|
||||||
{
|
{
|
||||||
const Codepoint closing_delimiter = it->closing;
|
const Codepoint closing_delimiter = it->closing;
|
||||||
auto quoted = parse_quoted_balanced(reader, opening_delimiter, closing_delimiter);
|
auto quoted = parse_quoted_balanced(state, opening_delimiter, closing_delimiter);
|
||||||
if (throw_on_unterminated and not quoted.terminated)
|
if (throw_on_unterminated and not quoted.terminated)
|
||||||
{
|
{
|
||||||
auto coord = compute_coord({reader.str.begin(), start});
|
auto coord = compute_coord({state.str.begin(), start});
|
||||||
throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
|
throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
|
||||||
coord.line+1, coord.column+1, type_name,
|
coord.line+1, coord.column+1, type_name,
|
||||||
opening_delimiter, closing_delimiter)};
|
opening_delimiter, closing_delimiter)};
|
||||||
}
|
}
|
||||||
|
|
||||||
return {type, start - str_beg, std::move(quoted.content), quoted.terminated};
|
return {type, byte_pos, std::move(quoted.content), quoted.terminated};
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
auto quoted = parse_quoted(reader, opening_delimiter);
|
auto quoted = parse_quoted(state, opening_delimiter);
|
||||||
|
|
||||||
if (throw_on_unterminated and not quoted.terminated)
|
if (throw_on_unterminated and not quoted.terminated)
|
||||||
{
|
{
|
||||||
auto coord = compute_coord({reader.str.begin(), start});
|
auto coord = compute_coord({state.str.begin(), start});
|
||||||
throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
|
throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
|
||||||
coord.line+1, coord.column+1, type_name,
|
coord.line+1, coord.column+1, type_name,
|
||||||
opening_delimiter, opening_delimiter)};
|
opening_delimiter, opening_delimiter)};
|
||||||
}
|
}
|
||||||
|
|
||||||
return {type, start - str_beg, std::move(quoted.content), quoted.terminated};
|
return {type, byte_pos, std::move(quoted.content), quoted.terminated};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -404,22 +376,22 @@ void expand_token(Token&& token, const Context& context, const ShellContext& she
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CommandParser::CommandParser(StringView command_line) : m_reader{command_line} {}
|
CommandParser::CommandParser(StringView command_line) : m_state{command_line, command_line.begin()} {}
|
||||||
|
|
||||||
Optional<Token> CommandParser::read_token(bool throw_on_unterminated)
|
Optional<Token> CommandParser::read_token(bool throw_on_unterminated)
|
||||||
{
|
{
|
||||||
skip_blanks_and_comments(m_reader);
|
skip_blanks_and_comments(m_state);
|
||||||
if (not m_reader)
|
if (not m_state)
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
const StringView line = m_reader.str;
|
const StringView line = m_state.str;
|
||||||
const char* start = m_reader.pos;
|
const char* start = m_state.pos;
|
||||||
|
|
||||||
const char c = *m_reader.pos;
|
const char c = *m_state.pos;
|
||||||
if (c == '"' or c == '\'')
|
if (c == '"' or c == '\'')
|
||||||
{
|
{
|
||||||
start = m_reader.next_byte().pos;
|
start = ++m_state.pos;
|
||||||
QuotedResult quoted = parse_quoted(m_reader, c);
|
ParseResult quoted = parse_quoted(m_state, c);
|
||||||
if (throw_on_unterminated and not quoted.terminated)
|
if (throw_on_unterminated and not quoted.terminated)
|
||||||
throw parse_error{format("unterminated string {0}...{0}", c)};
|
throw parse_error{format("unterminated string {0}...{0}", c)};
|
||||||
return Token{c == '"' ? Token::Type::RawEval
|
return Token{c == '"' ? Token::Type::RawEval
|
||||||
|
@ -429,25 +401,21 @@ Optional<Token> CommandParser::read_token(bool throw_on_unterminated)
|
||||||
}
|
}
|
||||||
else if (c == '%')
|
else if (c == '%')
|
||||||
{
|
{
|
||||||
auto token = parse_percent_token(m_reader, throw_on_unterminated);
|
++m_state.pos;
|
||||||
return token;
|
return parse_percent_token(m_state, throw_on_unterminated);
|
||||||
}
|
}
|
||||||
else if (is_command_separator(c))
|
else if (is_command_separator(c))
|
||||||
{
|
|
||||||
m_reader.next_byte();
|
|
||||||
return Token{Token::Type::CommandSeparator,
|
return Token{Token::Type::CommandSeparator,
|
||||||
m_reader.pos - line.begin(), {}};
|
++m_state.pos - line.begin(), {}};
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (c == '\\')
|
if (c == '\\' and m_state.pos + 1 != m_state.str.end())
|
||||||
{
|
{
|
||||||
auto next = m_reader.peek_next();
|
const char next = m_state.pos[1];
|
||||||
if (next == '%' or next == '\'' or next == '"')
|
if (next == '%' or next == '\'' or next == '"')
|
||||||
m_reader.next_byte();
|
++m_state.pos;
|
||||||
}
|
}
|
||||||
return Token{Token::Type::Raw, start - line.begin(),
|
return Token{Token::Type::Raw, start - line.begin(), parse_unquoted(m_state)};
|
||||||
parse_unquoted(m_reader)};
|
|
||||||
}
|
}
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
@ -457,32 +425,29 @@ String expand_impl(StringView str, const Context& context,
|
||||||
const ShellContext& shell_context,
|
const ShellContext& shell_context,
|
||||||
Postprocess postprocess)
|
Postprocess postprocess)
|
||||||
{
|
{
|
||||||
Reader reader{str};
|
ParseState state{str, str.begin()};
|
||||||
String res;
|
String res;
|
||||||
auto beg = str.begin();
|
auto beg = state.pos;
|
||||||
while (reader)
|
while (state)
|
||||||
{
|
{
|
||||||
Codepoint c = *reader;
|
if (*state.pos++ == '%')
|
||||||
if (c == '%')
|
|
||||||
{
|
{
|
||||||
if (reader.peek_next() == '%')
|
if (state and *state.pos == '%')
|
||||||
{
|
{
|
||||||
res += (++reader).substr_from(beg);
|
res += StringView{beg, state.pos};
|
||||||
beg = (++reader).pos;
|
beg = ++state.pos;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
res += reader.substr_from(beg);
|
res += StringView{beg, state.pos-1};
|
||||||
String token;
|
String token;
|
||||||
expand_token(parse_percent_token(reader, true), context, shell_context, token);
|
expand_token(parse_percent_token(state, true), context, shell_context, token);
|
||||||
res += postprocess(token);
|
res += postprocess(token);
|
||||||
beg = reader.pos;
|
beg = state.pos;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
++reader;
|
|
||||||
}
|
}
|
||||||
res += reader.substr_from(beg);
|
res += StringView{beg, state.pos};
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -826,9 +791,9 @@ UnitTest test_command_parsing{[]
|
||||||
{
|
{
|
||||||
auto check_quoted = [](StringView str, bool terminated, StringView content)
|
auto check_quoted = [](StringView str, bool terminated, StringView content)
|
||||||
{
|
{
|
||||||
Reader reader{str};
|
ParseState state{str, str.begin()};
|
||||||
const Codepoint delimiter = *reader;
|
const Codepoint delimiter = *state.pos++;
|
||||||
auto quoted = parse_quoted(++reader, delimiter);
|
auto quoted = parse_quoted(state, delimiter);
|
||||||
kak_assert(quoted.terminated == terminated);
|
kak_assert(quoted.terminated == terminated);
|
||||||
kak_assert(quoted.content == content);
|
kak_assert(quoted.content == content);
|
||||||
};
|
};
|
||||||
|
@ -838,8 +803,8 @@ UnitTest test_command_parsing{[]
|
||||||
|
|
||||||
auto check_balanced = [](StringView str, Codepoint opening, Codepoint closing, bool terminated, StringView content)
|
auto check_balanced = [](StringView str, Codepoint opening, Codepoint closing, bool terminated, StringView content)
|
||||||
{
|
{
|
||||||
Reader reader{str};
|
ParseState state{str, str.begin()+1};
|
||||||
auto quoted = parse_quoted_balanced(++reader, opening, closing);
|
auto quoted = parse_quoted_balanced(state, opening, closing);
|
||||||
kak_assert(quoted.terminated == terminated);
|
kak_assert(quoted.terminated == terminated);
|
||||||
kak_assert(quoted.content == content);
|
kak_assert(quoted.content == content);
|
||||||
};
|
};
|
||||||
|
@ -849,9 +814,8 @@ UnitTest test_command_parsing{[]
|
||||||
|
|
||||||
auto check_unquoted = [](StringView str, StringView content)
|
auto check_unquoted = [](StringView str, StringView content)
|
||||||
{
|
{
|
||||||
Reader reader{str};
|
ParseState state{str, str.begin()};
|
||||||
auto res = parse_unquoted(reader);
|
kak_assert(parse_unquoted(state) == content);
|
||||||
kak_assert(res == content);
|
|
||||||
};
|
};
|
||||||
check_unquoted("abc def", "abc");
|
check_unquoted("abc def", "abc");
|
||||||
check_unquoted("abc; def", "abc");
|
check_unquoted("abc; def", "abc");
|
||||||
|
|
|
@ -61,21 +61,12 @@ struct Token
|
||||||
bool terminated = false;
|
bool terminated = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Reader
|
struct ParseState
|
||||||
{
|
{
|
||||||
public:
|
|
||||||
Reader(StringView s) : str{s}, pos{s.begin()} {}
|
|
||||||
|
|
||||||
Codepoint operator*() const;
|
|
||||||
Codepoint peek_next() const;
|
|
||||||
Reader& operator++();
|
|
||||||
Reader& next_byte();
|
|
||||||
|
|
||||||
explicit operator bool() const { return pos < str.end(); }
|
|
||||||
StringView substr_from(const char* start) const { return {start, pos}; }
|
|
||||||
|
|
||||||
StringView str;
|
StringView str;
|
||||||
const char* pos;
|
const char* pos;
|
||||||
|
|
||||||
|
operator bool() const { return pos != str.end(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
class CommandParser
|
class CommandParser
|
||||||
|
@ -84,11 +75,11 @@ public:
|
||||||
CommandParser(StringView command_line);
|
CommandParser(StringView command_line);
|
||||||
Optional<Token> read_token(bool throw_on_unterminated);
|
Optional<Token> read_token(bool throw_on_unterminated);
|
||||||
|
|
||||||
const char* pos() const { return m_reader.pos; }
|
const char* pos() const { return m_state.pos; }
|
||||||
bool done() const { return not m_reader; }
|
bool done() const { return not m_state; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Reader m_reader;
|
ParseState m_state;
|
||||||
};
|
};
|
||||||
|
|
||||||
class CommandManager : public Singleton<CommandManager>
|
class CommandManager : public Singleton<CommandManager>
|
||||||
|
|
Loading…
Reference in New Issue
Block a user