diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 4419a77..7a0a96e 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -32,9 +32,15 @@ # The approach is to be Python3 compatible with Python2 "backports". from __future__ import print_function from __future__ import unicode_literals +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped + +import argparse import codecs +import re import sys +import xml.etree.ElementTree as ET +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped if sys.version_info[0] <= 2: # Encode stdout as UTF-8, so we can just print to it sys.stdout = codecs.getwriter('utf8')(sys.stdout) @@ -45,12 +51,9 @@ if sys.version_info[0] <= 2: # Python 2 and 3 compatible bytes call def bytes(source, encoding='ascii', errors='strict'): return source.encode(encoding=encoding, errors=errors) +else: # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped - -import re -import argparse -import sys -import xml.etree.ElementTree as ET + sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer) # The ranges of Unicode characters that we consider to be "plain letters". # For now we are being conservative by including only Latin and Greek. This @@ -61,8 +64,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA +# Combining marks follow a "base" character, and result in a composite +# character. Example: "U&'A\0300'" produces "À". There are three types of +# combining marks: enclosing (Me), non-spacing combining (Mn), spacing +# combining (Mc). We identify the ranges of marks we feel safe removing. 
+# References: +# https://en.wikipedia.org/wiki/Combining_character +# https://www.unicode.org/charts/PDF/U0300.pdf +# https://www.unicode.org/charts/PDF/U20D0.pdf +COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA + (0x20dd, 0x20E0), # Me: Symbols + (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle + def print_record(codepoint, letter): - print (chr(codepoint) + "\t" + letter) + if letter: + output = chr(codepoint) + "\t" + letter + else: + output = chr(codepoint) + + print(output) class Codepoint: def __init__(self, id, general_category, combining_ids): @@ -70,6 +90,16 @@ class Codepoint: self.general_category = general_category self.combining_ids = combining_ids +def is_mark_to_remove(codepoint): + """Return true if this is a combining mark to remove.""" + if not is_mark(codepoint): + return False + + for begin, end in COMBINING_MARK_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False + def is_plain_letter(codepoint): """Return true if codepoint represents a "plain letter".""" for begin, end in PLAIN_LETTER_RANGES: @@ -206,21 +236,22 @@ def main(args): charactersSet = set() # read file UnicodeData.txt - unicodeDataFile = open(args.unicodeDataFilePath, 'r') - - # read everything we need into memory - for line in unicodeDataFile: - fields = line.split(";") - if len(fields) > 5: - # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt - general_category = fields[2] - decomposition = fields[5] - decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) - id = int(fields[0], 16) - combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] - codepoint = Codepoint(id, general_category, combining_ids) - table[id] = codepoint - all.append(codepoint) + with codecs.open( + args.unicodeDataFilePath, mode='r', encoding='UTF-8', + ) as unicodeDataFile: + # read everything we need into memory + for line in unicodeDataFile: + fields = line.split(";") + if len(fields) > 5: + # 
http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt + general_category = fields[2] + decomposition = fields[5] + decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) + id = int(fields[0], 16) + combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] + codepoint = Codepoint(id, general_category, combining_ids) + table[id] = codepoint + all.append(codepoint) # walk through all the codepoints looking for interesting mappings for codepoint in all: @@ -234,6 +265,8 @@ def main(args): "".join(chr(combining_codepoint.id) for combining_codepoint \ in get_plain_letters(codepoint, table)))) + elif is_mark_to_remove(codepoint): + charactersSet.add((codepoint.id, None)) # add CLDR Latin-ASCII characters if not args.noLigaturesExpansion: diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index c671827..2ae097f 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -9,13 +9,16 @@ SELECT unaccent('foobar'); SELECT unaccent('ёлка'); SELECT unaccent('ЁЖИК'); SELECT unaccent('˃˖˗˜'); +SELECT unaccent('À'); -- Remove combining diacritical 0x0300 SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'ёлка'); SELECT unaccent('unaccent', 'ЁЖИК'); SELECT unaccent('unaccent', '˃˖˗˜'); +SELECT unaccent('unaccent', 'À'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); SELECT ts_lexize('unaccent', 'ЁЖИК'); SELECT ts_lexize('unaccent', '˃˖˗˜'); +SELECT ts_lexize('unaccent', 'À'); diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 7ce25ee..9982640 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -414,6 +414,105 @@ ˖ + ˗ - ˜ ~ +̀ +́ +̂ +̃ +̄ +̅ +̆ +̇ +̈ +̉ +̊ +̋ +̌ +̍ +̎ +̏ +̐ +̑ +̒ +̓ +̔ +̕ +̖ +̗ +̘ +̙ +̚ +̛ +̜ +̝ +̞ +̟ +̠ +̡ +̢ +̣ +̤ +̥ +̦ +̧ +̨ +̩ +̪ +̫ +̬ +̭ +̮ +̯ +̰ +̱ +̲ +̳ +̴ +̵ +̶ +̷ +̸ +̹ +̺ +̻ +̼ +̽ +̾ +̿ +̀ +́ +͂ +̓ +̈́ +ͅ +͆ +͇ +͈ +͉ +͊ +͋ +͌ +͍ +͎ +͏ +͐ +͑ 
+͒ +͓ +͔ +͕ +͖ +͗ +͘ +͙ +͚ +͛ +͜ +͝ +͞ +͟ +͠ +͡ +͢ Ά Α Έ Ε Ή Η @@ -982,6 +1081,13 @@ ₧ Pts ₹ Rs ₺ TL +⃝ +⃞ +⃟ +⃠ +⃢ +⃣ +⃤ ℀ a/c ℁ a/s ℂ C