From 0fc7e138b072c358450e53eb10dc6fe7b5f9cb23 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 20 Aug 2021 12:23:50 -0400 Subject: [PATCH v2 3/4] Fix display width of emoji and other codepoints The hardcoded "wide character" set at the end of ucs_wcwidth() was last touched around the Unicode 5.0 era. This led to misalignment on modern platforms when printing emoji and other codepoints that have since been designated wide/fullwidth. To fix, extend update-unicode to get the correct widths from the EastAsianWidth.txt file. Jacob Champion, with some adjustments by me Reported and reviewed by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com --- src/common/unicode/.gitignore | 1 + src/common/unicode/Makefile | 8 +- .../unicode/generate-unicode_width_table.pl | 99 +++++++++++++-- src/common/wchar.c | 23 +--- src/include/common/unicode_width_table.h | 116 ++++++++++++++++++ 5 files changed, 214 insertions(+), 33 deletions(-) diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore index 512862e538..46243f701d 100644 --- a/src/common/unicode/.gitignore +++ b/src/common/unicode/.gitignore @@ -4,5 +4,6 @@ # Downloaded files /CompositionExclusions.txt /DerivedNormalizationProps.txt +/EastAsianWidth.txt /NormalizationTest.txt /UnicodeData.txt diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index 499e31d59f..5ddf7d0cb7 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -25,7 +25,7 @@ update-unicode: unicode_norm_table.h unicode_width_table.h unicode_normprops_tab # These files are part of the Unicode Character Database. Download # them on demand. The dependency on Makefile.global is for # UNICODE_VERSION. 
-UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global +UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) # Generation of conversion tables used for string normalization with @@ -35,8 +35,8 @@ unicode_norm_hashfunc.h: unicode_norm_table.h unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt $(PERL) generate-unicode_norm_table.pl -unicode_width_table.h: generate-unicode_width_table.pl UnicodeData.txt - $(PERL) $^ >$@ +unicode_width_table.h: generate-unicode_width_table.pl UnicodeData.txt EastAsianWidth.txt + $(PERL) generate-unicode_width_table.pl >$@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt $(PERL) $^ >$@ @@ -64,6 +64,6 @@ clean: rm -f $(OBJS) norm_test norm_test.o distclean: clean - rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h + rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h maintainer-clean: distclean diff --git a/src/common/unicode/generate-unicode_width_table.pl b/src/common/unicode/generate-unicode_width_table.pl index 0cf44b029c..1f8ddc6331 100644 --- a/src/common/unicode/generate-unicode_width_table.pl +++ b/src/common/unicode/generate-unicode_width_table.pl @@ -1,25 +1,30 @@ #!/usr/bin/perl # -# Generate sorted list of non-overlapping intervals of non-spacing -# characters, using Unicode data files as input. Pass UnicodeData.txt -# as argument. The output is on stdout. +# Generate sorted list of non-overlapping intervals of characters and +# their display widths, using Unicode data files as input. +# The output is on stdout. 
# # Copyright (c) 2019-2021, PostgreSQL Global Development Group use strict; use warnings; -my $range_start = undef; -my $codepoint; -my $prev_codepoint; -my $count = 0; +my $UD; +my $EAW; +my @ranges; print "/* generated by src/common/unicode/generate-unicode_width_table.pl, do not edit */\n\n"; -print "static const struct mbinterval wcwidth[] = {\n"; +# First get the combining characters (width = 0) +my $range_start = undef; +my $codepoint; +my $prev_codepoint; + +open($UD, '<', "UnicodeData.txt") + or die "Could not open UnicodeData.txt: $!."; -foreach my $line () +foreach my $line (<$UD>) { chomp $line; my @fields = split ';', $line; @@ -40,7 +45,7 @@ foreach my $line () # not a combining character, print out previous range if any if (defined($range_start)) { - printf "\t{0x%04X, 0x%04X, 0},\n", $range_start, $prev_codepoint; + push @ranges, {first => $range_start, last => $prev_codepoint, width => 0}; $range_start = undef; } } @@ -50,4 +55,78 @@ continue $prev_codepoint = $codepoint; } +# Now get the East Asian Wide (W) and East Asian Fullwidth (F) characters (width = 2) +$range_start = undef; +my ($first, $last); +my $prev_last; + +open($EAW, '<', "EastAsianWidth.txt") + or die "Could not open EastAsianWidth.txt: $!."; + +foreach my $line (<$EAW>) +{ + chomp $line; + $line =~ s/\s*#.*$//; + next if $line eq ''; + my ($codepoint, $width) = split ';', $line; + + if ($codepoint =~ /\.\./) + { + ($first, $last) = split /\.\./, $codepoint; + } + else + { + $first = $last = $codepoint; + } + + ($first, $last) = map(hex, ($first, $last)); + + if ($width eq 'F' || $width eq 'W') + { + # fullwidth/wide characters + if (!defined($range_start)) + { + # save for start of range if one hasn't been started yet + $range_start = $first; + } + elsif ($first != $prev_last + 1) + { + # ranges aren't contiguous; emit the last and start a new one + push @ranges, {first => $range_start, last => $prev_last, width => 2}; + $range_start = $first; + } + } + else + { + # not wide 
characters, print out previous range if any + if (defined($range_start)) + { + push @ranges, {first => $range_start, last => $prev_last, width => 2}; + $range_start = undef; + } + } +} +continue +{ + $prev_last = $last; +} + +# don't forget any ranges at the very end of the database (though there are none +# as of Unicode 13.0) +if (defined($range_start)) +{ + push @ranges, {first => $range_start, last => $prev_last, width => 2}; +} + +close $UD; +close $EAW; + +# emit the sorted ranges with their widths +print "static const struct mbinterval wcwidth[] = {\n"; + +foreach my $range (sort {$a->{first} <=> $b->{first}} @ranges) +{ + printf "\t{0x%04X, 0x%04X, %d},\n", $range->{first}, $range->{last}, $range->{width}; +} + print "};\n"; diff --git a/src/common/wchar.c b/src/common/wchar.c index c0397ca139..e7c3f5dd09 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -583,9 +583,9 @@ pg_utf_mblen(const unsigned char *s) struct mbinterval { - unsigned short first; - unsigned short last; - signed short width; + unsigned int first; + unsigned int last:21; + signed int width:4; }; /* auxiliary function for binary search in interval table */ @@ -662,22 +662,7 @@ ucs_wcwidth(pg_wchar ucs) if (range != NULL) return range->width; - /* - * if we arrive here, ucs is not a combining or C0/C1 control character - */ - - return 1 + - (ucs >= 0x1100 && - (ucs <= 0x115f || /* Hangul Jamo init. consonants */ - (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a && - ucs != 0x303f) || /* CJK ... 
Yi */ - (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ - (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility - * Ideographs */ - (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ - (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */ - (ucs >= 0xffe0 && ucs <= 0xffe6) || - (ucs >= 0x20000 && ucs <= 0x2ffff))); + return 1; } /* diff --git a/src/include/common/unicode_width_table.h b/src/include/common/unicode_width_table.h index 3b161f47a4..5b7b6e15e2 100644 --- a/src/include/common/unicode_width_table.h +++ b/src/include/common/unicode_width_table.h @@ -102,6 +102,7 @@ static const struct mbinterval wcwidth[] = { {0x1085, 0x1086, 0}, {0x108D, 0x108D, 0}, {0x109D, 0x109D, 0}, + {0x1100, 0x115F, 2}, {0x135D, 0x135F, 0}, {0x1712, 0x1714, 0}, {0x1732, 0x1734, 0}, @@ -150,11 +151,60 @@ static const struct mbinterval wcwidth[] = { {0x1CF8, 0x1CF9, 0}, {0x1DC0, 0x1DFF, 0}, {0x20D0, 0x20F0, 0}, + {0x231A, 0x231B, 2}, + {0x2329, 0x232A, 2}, + {0x23E9, 0x23EC, 2}, + {0x23F0, 0x23F0, 2}, + {0x23F3, 0x23F3, 2}, + {0x25FD, 0x25FE, 2}, + {0x2614, 0x2615, 2}, + {0x2648, 0x2653, 2}, + {0x267F, 0x267F, 2}, + {0x2693, 0x2693, 2}, + {0x26A1, 0x26A1, 2}, + {0x26AA, 0x26AB, 2}, + {0x26BD, 0x26BE, 2}, + {0x26C4, 0x26C5, 2}, + {0x26CE, 0x26CE, 2}, + {0x26D4, 0x26D4, 2}, + {0x26EA, 0x26EA, 2}, + {0x26F2, 0x26F3, 2}, + {0x26F5, 0x26F5, 2}, + {0x26FA, 0x26FA, 2}, + {0x26FD, 0x26FD, 2}, + {0x2705, 0x2705, 2}, + {0x270A, 0x270B, 2}, + {0x2728, 0x2728, 2}, + {0x274C, 0x274C, 2}, + {0x274E, 0x274E, 2}, + {0x2753, 0x2755, 2}, + {0x2757, 0x2757, 2}, + {0x2795, 0x2797, 2}, + {0x27B0, 0x27B0, 2}, + {0x27BF, 0x27BF, 2}, + {0x2B1B, 0x2B1C, 2}, + {0x2B50, 0x2B50, 2}, + {0x2B55, 0x2B55, 2}, {0x2CEF, 0x2CF1, 0}, {0x2D7F, 0x2D7F, 0}, {0x2DE0, 0x2DFF, 0}, + {0x2E80, 0x2E99, 2}, + {0x2E9B, 0x2EF3, 2}, + {0x2F00, 0x2FD5, 2}, + {0x2FF0, 0x2FFB, 2}, + {0x3000, 0x303E, 2}, {0x302A, 0x302D, 0}, + {0x3041, 0x3096, 2}, {0x3099, 0x309A, 0}, + {0x3099, 0x30FF, 2}, + {0x3105, 
0x312F, 2}, + {0x3131, 0x318E, 2}, + {0x3190, 0x31E3, 2}, + {0x31F0, 0x321E, 2}, + {0x3220, 0x3247, 2}, + {0x3250, 0x4DBF, 2}, + {0x4E00, 0xA48C, 2}, + {0xA490, 0xA4C6, 2}, {0xA66F, 0xA672, 0}, {0xA674, 0xA67D, 0}, {0xA69E, 0xA69F, 0}, @@ -169,6 +219,7 @@ static const struct mbinterval wcwidth[] = { {0xA8FF, 0xA8FF, 0}, {0xA926, 0xA92D, 0}, {0xA947, 0xA951, 0}, + {0xA960, 0xA97C, 2}, {0xA980, 0xA982, 0}, {0xA9B3, 0xA9B3, 0}, {0xA9B6, 0xA9B9, 0}, @@ -190,7 +241,72 @@ static const struct mbinterval wcwidth[] = { {0xABE5, 0xABE5, 0}, {0xABE8, 0xABE8, 0}, {0xABED, 0xABED, 0}, + {0xAC00, 0xD7A3, 2}, + {0xF900, 0xFAFF, 2}, {0xFB1E, 0xFB1E, 0}, {0xFE00, 0xFE0F, 0}, + {0xFE10, 0xFE19, 2}, {0xFE20, 0xFE2F, 0}, + {0xFE30, 0xFE52, 2}, + {0xFE54, 0xFE66, 2}, + {0xFE68, 0xFE6B, 2}, + {0xFF01, 0xFF60, 2}, + {0xFFE0, 0xFFE6, 2}, + {0x16FE0, 0x16FE4, 2}, + {0x16FF0, 0x16FF1, 2}, + {0x17000, 0x187F7, 2}, + {0x18800, 0x18CD5, 2}, + {0x18D00, 0x18D08, 2}, + {0x1B000, 0x1B11E, 2}, + {0x1B150, 0x1B152, 2}, + {0x1B164, 0x1B167, 2}, + {0x1B170, 0x1B2FB, 2}, + {0x1F004, 0x1F004, 2}, + {0x1F0CF, 0x1F0CF, 2}, + {0x1F18E, 0x1F18E, 2}, + {0x1F191, 0x1F19A, 2}, + {0x1F200, 0x1F202, 2}, + {0x1F210, 0x1F23B, 2}, + {0x1F240, 0x1F248, 2}, + {0x1F250, 0x1F251, 2}, + {0x1F260, 0x1F265, 2}, + {0x1F300, 0x1F320, 2}, + {0x1F32D, 0x1F335, 2}, + {0x1F337, 0x1F37C, 2}, + {0x1F37E, 0x1F393, 2}, + {0x1F3A0, 0x1F3CA, 2}, + {0x1F3CF, 0x1F3D3, 2}, + {0x1F3E0, 0x1F3F0, 2}, + {0x1F3F4, 0x1F3F4, 2}, + {0x1F3F8, 0x1F43E, 2}, + {0x1F440, 0x1F440, 2}, + {0x1F442, 0x1F4FC, 2}, + {0x1F4FF, 0x1F53D, 2}, + {0x1F54B, 0x1F54E, 2}, + {0x1F550, 0x1F567, 2}, + {0x1F57A, 0x1F57A, 2}, + {0x1F595, 0x1F596, 2}, + {0x1F5A4, 0x1F5A4, 2}, + {0x1F5FB, 0x1F64F, 2}, + {0x1F680, 0x1F6C5, 2}, + {0x1F6CC, 0x1F6CC, 2}, + {0x1F6D0, 0x1F6D2, 2}, + {0x1F6D5, 0x1F6D7, 2}, + {0x1F6EB, 0x1F6EC, 2}, + {0x1F6F4, 0x1F6FC, 2}, + {0x1F7E0, 0x1F7EB, 2}, + {0x1F90C, 0x1F93A, 2}, + {0x1F93C, 0x1F945, 2}, + {0x1F947, 0x1F978, 2}, + {0x1F97A, 0x1F9CB, 
2}, + {0x1F9CD, 0x1F9FF, 2}, + {0x1FA70, 0x1FA74, 2}, + {0x1FA78, 0x1FA7A, 2}, + {0x1FA80, 0x1FA86, 2}, + {0x1FA90, 0x1FAA8, 2}, + {0x1FAB0, 0x1FAB6, 2}, + {0x1FAC0, 0x1FAC2, 2}, + {0x1FAD0, 0x1FAD6, 2}, + {0x20000, 0x2FFFD, 2}, + {0x30000, 0x3FFFD, 2}, }; -- 2.31.1