Avoid iswlower, iswupper, towlower and towupper for ASCII codepoints

Avoid the costly shared-object function calls, since most codepoints
are expected to be ASCII.

The regex benchmark gets a nice speedup:

Regex                                   Before     After
--------------------------------------+----------+---------
'Twain'                               |    25 ms | 15 ms
'(?i)Twain'                           |    74 ms | 57 ms
'[a-z]shing'                          |   323 ms | 303 ms
'Huck[a-zA-Z]+|Saw[a-zA-Z]+'          |    26 ms | 17 ms
'\b\w+nn\b'                           |   424 ms | 393 ms
'[a-q][^u-z]{13}x'                    |   869 ms | 815 ms
'Tom|Sawyer|Huckleberry|Finn'         |    33 ms | 24 ms
'(?i)Tom|Sawyer|Huckleberry|Finn'     |   319 ms | 281 ms
'.{0,2}(Tom|Sawyer|Huckleberry|Finn)' |  1294 ms | 1293 ms
'.{2,4}(Tom|Sawyer|Huckleberry|Finn)' |  1470 ms | 1429 ms
'Tom.{10,25}river|river.{10,25}Tom'   |    69 ms | 61 ms
'[a-zA-Z]+ing'                        |   447 ms | 408 ms
'\s[a-zA-Z]{0,12}ing\s'               |   539 ms | 543 ms
'([A-Za-z]awyer|[A-Za-z]inn)\s'       |   588 ms | 552 ms
'["'][^"']{0,30}[?!\.]["']'           |    92 ms | 81 ms
This commit is contained in:
Maxime Coste 2024-02-06 22:13:57 +11:00
parent 04a96b059f
commit 3ef68188b4

View File

@@ -124,18 +124,18 @@ inline CharCategories categorize(Codepoint c, ConstArrayView<Codepoint> extra_wo
return CharCategories::Punctuation; return CharCategories::Punctuation;
} }
inline Codepoint to_lower(Codepoint cp) noexcept { return towlower((wchar_t)cp); }
inline Codepoint to_upper(Codepoint cp) noexcept { return towupper((wchar_t)cp); }
inline bool is_lower(Codepoint cp) noexcept { return iswlower((wchar_t)cp); }
inline bool is_upper(Codepoint cp) noexcept { return iswupper((wchar_t)cp); }
inline char to_lower(char c) noexcept { return c >= 'A' and c <= 'Z' ? c - 'A' + 'a' : c; } inline char to_lower(char c) noexcept { return c >= 'A' and c <= 'Z' ? c - 'A' + 'a' : c; }
inline char to_upper(char c) noexcept { return c >= 'a' and c <= 'z' ? c - 'a' + 'A' : c; } inline char to_upper(char c) noexcept { return c >= 'a' and c <= 'z' ? c - 'a' + 'A' : c; }
inline bool is_lower(char c) noexcept { return c >= 'a' and c <= 'z'; } inline bool is_lower(char c) noexcept { return c >= 'a' and c <= 'z'; }
inline bool is_upper(char c) noexcept { return c >= 'A' and c <= 'Z'; } inline bool is_upper(char c) noexcept { return c >= 'A' and c <= 'Z'; }
inline Codepoint to_lower(Codepoint cp) noexcept { return cp < 128 ? (Codepoint)to_lower((char)cp) : towlower((wchar_t)cp); }
inline Codepoint to_upper(Codepoint cp) noexcept { return cp < 128 ? (Codepoint)to_upper((char)cp) : towupper((wchar_t)cp); }
inline bool is_lower(Codepoint cp) noexcept { return cp < 128 ? is_lower((char)cp) : iswlower((wchar_t)cp); }
inline bool is_upper(Codepoint cp) noexcept { return cp < 128 ? is_upper((char)cp) : iswupper((wchar_t)cp); }
} }
#endif // unicode_hh_INCLUDED #endif // unicode_hh_INCLUDED