patch: support unicode characters above U+10000 - Mailing list pgsql-jdbc
From | Oliver Jowett |
---|---|
Subject | patch: support unicode characters above U+10000 |
Date | |
Msg-id | 4116B439.60207@opencloud.com Whole thread Raw |
List | pgsql-jdbc |
This patch adds support for translating UTF-8 representations of Unicode characters at or above U+10000 into UTF-16 surrogate pairs. Once the server supports these characters (see recent discussion on -hackers), the driver should be able to process them without problems (in theory...). This translation behaviour is the same as what (at least) JDK 1.4 does when decoding UTF-8 via a String ctor. To actually handle the resulting surrogate pairs properly throughout the system you need a 1.5 JDK. See http://java.sun.com/developer/technicalArticles/Intl/Supplementary/ for some background. I also added checks for illegal encodings in the decoder, and added more test cases for the decoder since I've broken it once before. Along the way I did some microbenchmarking of the decoder against 1.4.2 client and server JVMs. It's still substantially faster to use our own decoder here rather than use the String ctor (factor of 2 difference). The new checks for illegal encodings add about 10-15% overhead. -O Index: org/postgresql/core/Encoding.java =================================================================== RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/core/Encoding.java,v retrieving revision 1.16 diff -u -c -r1.16 Encoding.java *** org/postgresql/core/Encoding.java 17 Jul 2004 07:39:41 -0000 1.16 --- org/postgresql/core/Encoding.java 8 Aug 2004 23:00:50 -0000 *************** *** 261,268 **** /** * Custom byte[] -> String conversion routine for UTF-8 only. ! * This is about 30% faster than using the String(byte[],int,int,String) ! * ctor, at least under JDK 1.4.2. * * @param data the array containing UTF8-encoded data * @param offset the offset of the first byte in <code>data</code> to decode from --- 261,270 ---- /** * Custom byte[] -> String conversion routine for UTF-8 only. ! * This is about twice as fast as using the String(byte[],int,int,String) ! * ctor, at least under JDK 1.4.2. The extra checks for illegal representations ! 
* add about 10-15% overhead but seem worth it given the number of SQL_ASCII ! * databases out there.. * * @param data the array containing UTF8-encoded data * @param offset the offset of the first byte in <code>data</code> to decode from *************** *** 270,276 **** * @return a decoded string * @throws IOException if something goes wrong */ ! private synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException { char[] cdata = decoderArray; if (cdata.length < length) cdata = decoderArray = new char[length]; --- 272,278 ---- * @return a decoded string * @throws IOException if something goes wrong */ ! public synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException { char[] cdata = decoderArray; if (cdata.length < length) cdata = decoderArray = new char[length]; *************** *** 282,309 **** try { while (in < end) { int ch = data[in++] & 0xff; if (ch < 0x80) { ! // Length 1: \u00000 .. \u0007f } else if (ch < 0xe0) { ! // Length 2: \u00080 .. \u007ff ch = ((ch & 0x1f) << 6); ch = ch | (data[in++] & 0x3f); ! } else { ! // Length 3: \u00800 .. \u0ffff ch = ((ch & 0x0f) << 12); ch = ch | ((data[in++] & 0x3f) << 6); ch = ch | (data[in++] & 0x3f); } - cdata[out++] = (char)ch; } } catch (ArrayIndexOutOfBoundsException a) { ! throw new IOException("UTF-8 string representation was truncated"); } ! // Check if we ran past the end without seeing an exception. if (in > end) ! throw new IOException("UTF-8 string representation was truncated"); ! return new String(cdata, 0, out); } --- 284,389 ---- try { while (in < end) { int ch = data[in++] & 0xff; + + // Convert UTF-8 to 31-bit codepoint. if (ch < 0x80) { ! // 0xxxxxxx -- length 1. ! } else if (ch < 0xc0) { ! // 10xxxxxx -- illegal! ! throw new IOException("Illegal UTF-8 input (initial byte is 10xxxxxx)"); } else if (ch < 0xe0) { ! 
// 110xxxxx 10xxxxxx ch = ((ch & 0x1f) << 6); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 2 of 2 not 10xxxxxx)"); ch = ch | (data[in++] & 0x3f); ! } else if (ch < 0xf0) { ! // 1110xxxx 10xxxxxx 10xxxxxx ch = ((ch & 0x0f) << 12); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 2 of 3 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 6); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 3 of 3 not 10xxxxxx)"); + ch = ch | (data[in++] & 0x3f); + } else if (ch < 0xf8) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + ch = ((ch & 0x07) << 18); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 2 of 4 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 12); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 3 of 4 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 6); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 4 of 4 not 10xxxxxx)"); + ch = ch | (data[in++] & 0x3f); + } else if (ch < 0xfc) { + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // nb: should never happen in theory, but might as well accept it anyway -- + // perhaps something is generating non-minimal UTF-8 output. 
+ ch = ((ch & 0x03) << 24); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 2 of 5 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 18); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 3 of 5 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 12); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 4 of 5 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 6); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 5 of 5 not 10xxxxxx)"); + ch = ch | (data[in++] & 0x3f); + } else if (ch < 0xfe) { + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // nb: should never happen in theory, but might as well accept it anyway -- + // perhaps something is generating non-minimal UTF-8 output. + ch = ((ch & 0x01) << 30); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 2 of 6 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 24); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 3 of 6 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 18); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 4 of 6 not 10xxxxxx)"); + ch = ch | ((data[in++] & 0x3f) << 12); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 5 of 6 not 10xxxxxx)"); ch = ch | ((data[in++] & 0x3f) << 6); + if ((data[in] & 0xc0) != 0x80) + throw new IOException("Illegal UTF-8 input (byte 6 of 6 not 10xxxxxx)"); ch = ch | (data[in++] & 0x3f); + } else { + throw new IOException("Illegal UTF-8 input (initial byte is 1111111x)"); + } + + // Convert 31-bit codepoint to UTF-16 + if (ch > 0x10ffff) + throw new IOException("Illegal UTF-8 input (final value out of range: " + ch + ")"); + + if (ch > 0xffff) { + // Use a surrogate pair to represent it. 
+ ch -= 0x10000; // ch is now 0..fffff (20 bits) + cdata[out++] = (char) (0xd800 + (ch >> 10)); // top 10 bits + cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits + } else if (ch >= 0xd800 && ch < 0xe000) { + // Not allowed to encode the surrogate range directly. + throw new IOException("Illegal UTF-8 input (final value is a surrogate value: " + ch + ")"); + } else { + // Normal case. + cdata[out++] = (char) ch; } } } catch (ArrayIndexOutOfBoundsException a) { ! throw new IOException("UTF-8 input was truncated"); } ! // Check if we ran past the end without seeing an exception. if (in > end) ! throw new IOException("UTF-8 input was truncated"); ! return new String(cdata, 0, out); } Index: org/postgresql/test/jdbc2/DatabaseEncodingTest.java =================================================================== RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/test/jdbc2/DatabaseEncodingTest.java,v retrieving revision 1.2 diff -u -c -r1.2 DatabaseEncodingTest.java *** org/postgresql/test/jdbc2/DatabaseEncodingTest.java 27 Jul 2004 05:03:04 -0000 1.2 --- org/postgresql/test/jdbc2/DatabaseEncodingTest.java 8 Aug 2004 23:00:50 -0000 *************** *** 1,16 **** package org.postgresql.test.jdbc2; import org.postgresql.test.TestUtil; import junit.framework.TestCase; import java.sql.*; /* ! * Test case for Dario's encoding problems. ! * Ensure the driver's own utf-8 decode method works. */ public class DatabaseEncodingTest extends TestCase { private Connection con; public DatabaseEncodingTest(String name) { --- 1,23 ---- package org.postgresql.test.jdbc2; import org.postgresql.test.TestUtil; + import org.postgresql.core.Encoding; + import java.io.IOException; + import java.util.Arrays; import junit.framework.TestCase; import java.sql.*; /* ! * Test case for various encoding problems. ! * ! * Ensure that we can do a round-trip of all server-supported unicode ! * values without trashing them, and that bad character encodings are ! * detected. 
*/ public class DatabaseEncodingTest extends TestCase { private Connection con; + private final Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8"); public DatabaseEncodingTest(String name) { *************** *** 66,73 **** rs.close(); // Create data. ! // NB: we only test up to d800 as code points above that are ! // reserved for surrogates in UTF-16 PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES(?,?)"); for (int i = 1; i < 0xd800; i += STEP) { int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP; --- 73,81 ---- rs.close(); // Create data. ! // NB: we avoid d800-dfff as that range is reserved for surrogates in UTF-16. ! // We also do not test codepoints above U+10000 as the server doesn't correctly ! // support them (yet). PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES(?,?)"); for (int i = 1; i < 0xd800; i += STEP) { int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP; *************** *** 82,87 **** --- 90,108 ---- assertEquals(1, insert.executeUpdate()); } + for (int i = 0xe000; i < 0x10000; i += STEP) { + int count = (i+STEP) > 0x10000 ? 0x10000-i : STEP; + char[] testChars = new char[count]; + for (int j = 0; j < count; ++j) + testChars[j] = (char)(i+j); + + String testString = new String(testChars); + + insert.setInt(1, i); + insert.setString(2, testString); + assertEquals(1, insert.executeUpdate()); + } + con.commit(); // Check data. *************** *** 99,103 **** --- 120,267 ---- assertEquals(dumpString(testString), dumpString(rs.getString(2))); } + + for (int i = 0xe000; i < 0x10000; i += STEP) { + assertTrue(rs.next()); + assertEquals(i, rs.getInt(1)); + + int count = (i+STEP) > 0x10000 ? 
0x10000-i : STEP; + char[] testChars = new char[count]; + for (int j = 0; j < count; ++j) + testChars[j] = (char)(i+j); + + String testString = new String(testChars); + + assertEquals(dumpString(testString), dumpString(rs.getString(2))); + } + } + + public void testUTF8Decode() throws Exception { + // Tests for our custom UTF-8 decoder. + + for (int ch = 0; ch < 0x110000; ++ch) { + if (ch >= 0xd800 && ch < 0xe000) + continue; // Surrogate range. + + String testString; + if (ch >= 0x10000) { + testString = new String(new char[] { + (char) (0xd800 + ((ch-0x10000) >> 10)), + (char) (0xdc00 + ((ch-0x10000) & 0x3ff)) }); + } else { + testString = new String(new char[] { (char)ch }); + } + + byte[] jvmEncoding = testString.getBytes("UTF-8"); + String jvmDecoding = new String(jvmEncoding, 0, jvmEncoding.length, "UTF-8"); + String ourDecoding = utf8Encoding.decode(jvmEncoding, 0, jvmEncoding.length); + + assertEquals(testString, jvmDecoding); + assertEquals(testString, ourDecoding); + } + } + + public void testBadUTF8Decode() throws Exception { + byte[][] badSequences = new byte[][] { + // One-byte illegal sequences + { (byte)0x80 }, // First byte may not be 10xxxxxx + + // Two-byte illegal sequences + { (byte)0xc0, (byte)0x00 }, // Second byte must be 10xxxxxx + + // Three-byte illegal sequences + { (byte)0xe0, (byte)0x00 }, // Second byte must be 10xxxxxx + { (byte)0xe0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx + { (byte)0xed, (byte)0xa0, (byte)0x80 }, // Not allowed to encode the range d800..dfff + + // Four-byte illegal sequences + { (byte)0xf0, (byte)0x00 }, // Second byte must be 10xxxxxx + { (byte)0xf0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx + { (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx + + // Five-byte illegal sequences + { (byte)0xf8, (byte)0x00 }, // Second byte must be 10xxxxxx + { (byte)0xf8, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx + { (byte)0xf8, (byte)0x80, (byte)0x80, 
(byte)0x00 }, // Fourth byte must be 10xxxxxx + { (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fifth byte must be 10xxxxxx + { (byte)0xf8, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 }, // Resulting value must be < U+110000 + + // Six-byte illegal sequences + { (byte)0xfc, (byte)0x00 }, // Second byte must be 10xxxxxx + { (byte)0xfc, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx + { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx + { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fifth byte must be 10xxxxxx + { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Sixth byte must be 10xxxxxx + { (byte)0xfc, (byte)0x80, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 }, // Resulting value must be < U+110000 + + // Seven-byte illegal sequences + { (byte)0xfe }, // Can't have a seven-byte sequence. + + // Eight-byte illegal sequences + { (byte)0xff }, // Can't have an eight-byte sequence. + }; + + byte[] paddedSequence = new byte[32]; + for (int i = 0; i < badSequences.length; ++i) { + byte[] sequence = badSequences[i]; + + try { + String str = utf8Encoding.decode(sequence, 0, sequence.length); + fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); + } catch (IOException ioe) {} + + // Try it with padding. 
+ Arrays.fill(paddedSequence, (byte)0); + System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length); + + try { + String str = utf8Encoding.decode(paddedSequence, 0, paddedSequence.length); + fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); + } catch (IOException ioe) {} + } + } + + public void testTruncatedUTF8Decode() throws Exception { + byte[][] shortSequences = new byte[][] { + { (byte)0xc0 }, // Second byte must be present + + { (byte)0xe0 }, // Second byte must be present + { (byte)0xe0, (byte)0x80 }, // Third byte must be present + + { (byte)0xf0 }, // Second byte must be present + { (byte)0xf0, (byte)0x80 }, // Third byte must be present + { (byte)0xf0, (byte)0x80, (byte)0x80 }, // Fourth byte must be present + + { (byte)0xfc }, // Second byte must be present + { (byte)0xfc, (byte)0x80 }, // Third byte must be present + { (byte)0xfc, (byte)0x80, (byte)0x80 }, // Fourth byte must be present + { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80 }, // Fifth byte must be present + { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // Sixth byte must be present + + { (byte)0xf8 }, // Second byte must be present + { (byte)0xf8, (byte)0x80 }, // Third byte must be present + { (byte)0xf8, (byte)0x80, (byte)0x80 }, // Fourth byte must be present + { (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80 }, // Fifth byte must be present + }; + + byte[] paddedSequence = new byte[32]; + for (int i = 0; i < shortSequences.length; ++i) { + byte[] sequence = shortSequences[i]; + + try { + String str = utf8Encoding.decode(sequence, 0, sequence.length); + fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); + } catch (IOException ioe) {} + + + // Try it with padding and a truncated length. 
+ Arrays.fill(paddedSequence, (byte)0); + System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length); + + try { + String str = utf8Encoding.decode(paddedSequence, 0, sequence.length); + fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); + } catch (IOException ioe) {} + } } }
pgsql-jdbc by date: