From 9dfab2f1fb383478db4116a1cf63d81e1c53bd0d Mon Sep 17 00:00:00 2001
From: Jacob Collins
Date: Wed, 3 Feb 2021 00:04:05 -0500
Subject: [PATCH] Follow ECMA specification for regex whitespace

Changes the behaviour of the \s and \h character classes to include all
WhiteSpace and LineTerminator characters defined in the ECMA
specification.

-
-
-

Fixes #4034
---
 src/unicode.hh | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/unicode.hh b/src/unicode.hh
index 486fc381..a385b6eb 100644
--- a/src/unicode.hh
+++ b/src/unicode.hh
@@ -20,12 +20,44 @@ inline bool is_eol(Codepoint c) noexcept
 
 inline bool is_horizontal_blank(Codepoint c) noexcept
 {
-    return c == ' ' or c == '\t';
+    // ECMA-262 WhiteSpace characters minus vertical tab, plus the
+    // LineTerminator code points U+2028 (LS) and U+2029 (PS)
+    //
+    return c == '\t'      or
+           c == '\f'      or
+           c == ' '       or
+           c == U'\u00A0' or
+           c == U'\uFEFF' or
+           c == U'\u1680' or
+           c == U'\u2000' or
+           c == U'\u2001' or
+           c == U'\u2002' or
+           c == U'\u2003' or
+           c == U'\u2004' or
+           c == U'\u2005' or
+           c == U'\u2006' or
+           c == U'\u2007' or
+           c == U'\u2008' or
+           c == U'\u2009' or
+           c == U'\u200A' or
+           c == U'\u2028' or
+           c == U'\u2029' or
+           c == U'\u202F' or
+           c == U'\u205F' or
+           c == U'\u3000' ;
 }
 
 inline bool is_blank(Codepoint c) noexcept
 {
-    return c == ' ' or c == '\t' or c == '\n';
+    // ECMA-262 LineTerminator characters plus vertical tab, in addition
+    // to every character accepted by is_horizontal_blank
+    //
+    return c == '\n'      or
+           c == '\r'      or
+           c == '\v'      or
+           c == U'\u2028' or
+           c == U'\u2029' or
+           is_horizontal_blank(c) ;
 }
 
 enum WordType { Word, WORD };