From 039ddd985d24a2efccbf9fc34bc5ef36a3cfd9bb Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 17 Dec 2021 12:55:34 -0400 Subject: [PATCH v2 1/3] Move the implementation of pg_utf_mblen() to an inline function Use that to specialize pg_mblen() for UTF-8. This provides a modest speedup for code that calls pg_mblen() in a loop. This has a side effect of removing the unnecessary check for zero bytes in pg_utf8_verifychar(). WIP: Maybe "fast" in the name is misleading -- the point is to be inlinable. --- src/backend/utils/mb/mbutils.c | 6 ++++- src/common/wchar.c | 45 ++-------------------------------- src/include/mb/pg_wchar.h | 33 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index a13c398f4a..91eea625b9 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -965,7 +965,11 @@ pg_encoding_wchar2mb_with_len(int encoding, int pg_mblen(const char *mbstr) { - return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + /* avoid the overhead of a function call for UTF-8 */ + if (GetDatabaseEncoding() == PG_UTF8) + return pg_utf_mblen_fast((const unsigned char *) mbstr); + else + return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); } /* returns the display length of a multibyte character */ diff --git a/src/common/wchar.c b/src/common/wchar.c index a6bffd0642..5fd682829c 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -536,37 +536,11 @@ pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len) /* * Return the byte length of a UTF8 character pointed to by s - * - * Note: in the current implementation we do not support UTF8 sequences - * of more than 4 bytes; hence do NOT return a value larger than 4. - * We return "1" for any leading byte that is either flat-out illegal or - * indicates a length larger than we support. - * - * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps - * other places would need to be fixed to change this. */ int pg_utf_mblen(const unsigned char *s) { - int len; - - if ((*s & 0x80) == 0) - len = 1; - else if ((*s & 0xe0) == 0xc0) - len = 2; - else if ((*s & 0xf0) == 0xe0) - len = 3; - else if ((*s & 0xf8) == 0xf0) - len = 4; -#ifdef NOT_USED - else if ((*s & 0xfc) == 0xf8) - len = 5; - else if ((*s & 0xfe) == 0xfc) - len = 6; -#endif - else - len = 1; - return len; + return pg_utf_mblen_fast(s); } /* @@ -1724,22 +1698,7 @@ pg_gb18030_verifystr(const unsigned char *s, int len) static int pg_utf8_verifychar(const unsigned char *s, int len) { - int l; - - if ((*s & 0x80) == 0) - { - if (*s == '\0') - return -1; - return 1; - } - else if ((*s & 0xe0) == 0xc0) - l = 2; - else if ((*s & 0xf0) == 0xe0) - l = 3; - else if ((*s & 0xf8) == 0xf0) - l = 4; - else - l = 1; + int l = pg_utf_mblen_fast(s); if (l > len) return -1; diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index d93ccac263..a8d67c1214 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -590,6 +590,39 @@ extern bool pg_utf8_islegal(const unsigned char *source, int length); extern int pg_utf_mblen(const unsigned char *s); extern int pg_mule_mblen(const unsigned char *s); +/* + * Return the byte length of a UTF8 character pointed to by s + * Workhorse for pg_utf_mblen(). + * + * Declared as inline for callers of pg_mblen() that are performance critical + * enough to justify specializing for UTF-8. + * + * Note: in the current implementation we do not support UTF8 sequences + * of more than 4 bytes; hence do NOT return a value larger than 4. + * We return "1" for any leading byte that is either flat-out illegal or + * indicates a length larger than we support. + * + * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps + * other places would need to be fixed to change this. + */ +static inline int +pg_utf_mblen_fast(const unsigned char *s) +{ + int len; + + if ((*s & 0x80) == 0) + len = 1; + else if ((*s & 0xe0) == 0xc0) + len = 2; + else if ((*s & 0xf0) == 0xe0) + len = 3; + else if ((*s & 0xf8) == 0xf0) + len = 4; + else + len = 1; + return len; +} + /* * The remaining functions are backend-only. */ -- 2.31.1