From f525c1b14eac7bbdb6c56fd6c40ba1048ea7f9ce Mon Sep 17 00:00:00 2001 From: Aleksander Alekseev Date: Tue, 12 Apr 2022 15:58:11 +0300 Subject: [PATCH v3] Compression dictionaries for JSONB Usage example: CREATE TYPE mydict AS DICTIONARY OF jsonb ('aaa', 'bbb', ...); SELECT ('{"aaa":"bbb"}' :: mydict) -> 'aaa'; The created type works as a drop-in replacement for JSONB. However, its internal representation differs. The provided dictionary entries ('aaa', 'bbb', ..) are stored in the pg_dict catalog table. When `mydict` sees one of the entries in the document, it replaces it with the corresponding pg_dict entry Oid. For more details regarding the compression algorithm and chosen compromises please see the comments in the code. In pg_type `mydict` has typtype = TYPTYPE_DICT. It works the same way as TYPTYPE_BASE with the only difference: corresponding `_in` (pg_type.typinput) and `_` (pg_cast.castfunc) procedures receive the dictionary Oid as a `typmod` argument. This way the procedures can distinguish `mydict1` from `mydict2` and use the proper compression dictionary. (NOTE: dear committer, please bump the catalog version!) 
Author: Aleksander Alekseev Reviewed-by: FIXME Discussion: https://postgr.es/m/CAJ7c6TOtAB0z1UrksvGTStNE-herK-43bj22%3D5xVBg7S4vr5rQ%40mail.gmail.com Discussion: https://postgr.es/m/CAJ7c6TPx7N-bVw0dZ1ASCDQKZJHhBYkT6w4HV1LzfS%2BUUTUfmA%40mail.gmail.com --- src/backend/catalog/Makefile | 3 +- src/backend/catalog/pg_dict.c | 175 +++++++ src/backend/commands/typecmds.c | 160 +++++- src/backend/executor/functions.c | 1 + src/backend/nodes/copyfuncs.c | 15 + src/backend/nodes/equalfuncs.c | 13 + src/backend/parser/gram.y | 8 + src/backend/parser/parse_coerce.c | 17 +- src/backend/parser/parse_type.c | 16 + src/backend/tcop/utility.c | 13 + src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/dictionaries.c | 479 ++++++++++++++++++ src/backend/utils/fmgr/funcapi.c | 1 + src/bin/pg_dump/pg_dump.c | 2 + src/include/catalog/pg_dict.h | 76 +++ src/include/catalog/pg_proc.dat | 20 + src/include/catalog/pg_type.h | 1 + src/include/commands/typecmds.h | 1 + src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 12 + src/pl/plpgsql/src/pl_comp.c | 1 + .../modules/test_oat_hooks/test_oat_hooks.c | 3 + src/test/regress/expected/dict.out | 47 ++ src/test/regress/expected/oidjoins.out | 1 + src/test/regress/expected/opr_sanity.out | 14 +- src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/dict.sql | 20 + 27 files changed, 1092 insertions(+), 11 deletions(-) create mode 100644 src/backend/catalog/pg_dict.c create mode 100644 src/backend/utils/adt/dictionaries.c create mode 100644 src/include/catalog/pg_dict.h create mode 100644 src/test/regress/expected/dict.out create mode 100644 src/test/regress/sql/dict.sql diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 89a0221ec9..3b4330148e 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -33,6 +33,7 @@ OBJS = \ pg_conversion.o \ pg_db_role_setting.o \ pg_depend.o \ + pg_dict.o \ pg_enum.o \ pg_inherits.o \ pg_largeobject.o \ @@ -61,7 +62,7 @@ 
CATALOG_HEADERS := \ pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \ pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \ pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ - pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ + pg_cast.h pg_dict.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ pg_database.h pg_db_role_setting.h pg_tablespace.h \ pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \ pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \ diff --git a/src/backend/catalog/pg_dict.c b/src/backend/catalog/pg_dict.c new file mode 100644 index 0000000000..ef76e565cd --- /dev/null +++ b/src/backend/catalog/pg_dict.c @@ -0,0 +1,175 @@ +/*------------------------------------------------------------------------- + * + * pg_dict.c + * routines to support manipulation of the pg_dict relation + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/catalog/pg_dict.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/genam.h" +#include "access/skey.h" +#include "access/table.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_dict.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" + +/* + * Creates an entry in pg_dict for each of the supplied values. + * vals is a list of String values. + */ +void +DictEntriesCreate(Oid dictTypeOid, List *vals) +{ + Relation pg_dict; + NameData dictentry; + Datum values[Natts_pg_dict]; + bool nulls[Natts_pg_dict]; + ListCell *lc; + HeapTuple tup; + + if (vals == NIL) + { + /* The list is empty; do nothing. */ + return; + } + + memset(nulls, false, sizeof(nulls)); + + /* + * We don't check the list of values for duplicates here. If there are + * any, the user will get an unique-index violation. 
+ */ + + pg_dict = table_open(DictRelationId, RowExclusiveLock); + foreach(lc, vals) + { + Oid oid = GetNewOidWithIndex(pg_dict, DictOidIndexId, + Anum_pg_dict_oid); + char *entry = strVal(lfirst(lc)); + + /* + * Entries are stored in a name field, for easier syscache lookup, so + * check the length to make sure it's within range. + */ + if (strlen(entry) > (NAMEDATALEN - 1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid dict entry \"%s\"", entry), + errdetail("Entries must be %d bytes or less.", + NAMEDATALEN - 1))); + + namestrcpy(&dictentry, entry); + + values[Anum_pg_dict_oid - 1] = ObjectIdGetDatum(oid); + values[Anum_pg_dict_dicttypid - 1] = ObjectIdGetDatum(dictTypeOid); + values[Anum_pg_dict_dictentry - 1] = NameGetDatum(&dictentry); + + tup = heap_form_tuple(RelationGetDescr(pg_dict), values, nulls); + + CatalogTupleInsert(pg_dict, tup); + heap_freetuple(tup); + } + + /* clean up */ + table_close(pg_dict, RowExclusiveLock); +} + +/* + * Returns all the entries for the dictinary with given Oid. Entries are sorted + * by dictentry. Note that shorter entries are considered smaller, i.e. 'abc' + * goes before 'abcdef'. The memory is allocated in the caller's memory context. + * + * If there are no entries a valid but empty dictionary is returned. 
+ */ +Dictionary +DictEntriesRead(Oid dictTypeOid) +{ + Relation pg_dict; + ScanKeyData key[1]; + SysScanDesc scan; + HeapTuple tup; + uint32 entries_allocated = 8; + Dictionary dict = (Dictionary)palloc(sizeof(DictionaryData)); + + dict->nentries = 0; + dict->entries = (DictEntry*)palloc(entries_allocated*sizeof(DictEntry)); + + pg_dict = table_open(DictRelationId, RowExclusiveLock); + + ScanKeyInit(&key[0], + Anum_pg_dict_dicttypid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(dictTypeOid)); + + scan = systable_beginscan(pg_dict, DictTypIdEntryIndexId, true, + NULL, 1, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + if(dict->nentries == entries_allocated) + { + entries_allocated = entries_allocated * 2; + dict->entries = (DictEntry*)repalloc(dict->entries, entries_allocated*sizeof(DictEntry)); + } + + dict->entries[dict->nentries].oid = ((Form_pg_dict) GETSTRUCT(tup))->oid; + dict->entries[dict->nentries].dictentry = ((Form_pg_dict) GETSTRUCT(tup))->dictentry; + /* + * This is arguably not the fastest way to determine the length of + * the entry. Alternatively, we could store a precalculated value in + * the catalog. However, usually it's a good idea to keep things simple + * until somebody discovers a bottleneck in this exact place and + * proposes a concrete fix. + */ + dict->entries[dict->nentries].length = (uint32)strlen(dict->entries[dict->nentries].dictentry.data); + dict->nentries++; + } + + systable_endscan(scan); + table_close(pg_dict, RowExclusiveLock); + + return dict; +} + +/* + * Deletes all the entries for the dictinary with given Oid. 
+ */ +void +DictEntriesDelete(Oid dictTypeOid) +{ + Relation pg_dict; + ScanKeyData key[1]; + SysScanDesc scan; + HeapTuple tup; + + pg_dict = table_open(DictRelationId, RowExclusiveLock); + + ScanKeyInit(&key[0], + Anum_pg_dict_dicttypid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(dictTypeOid)); + + scan = systable_beginscan(pg_dict, DictTypIdEntryIndexId, true, + NULL, 1, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + CatalogTupleDelete(pg_dict, &tup->t_self); + } + + systable_endscan(scan); + table_close(pg_dict, RowExclusiveLock); +} diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 9b92b04242..3115fb7336 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -46,6 +46,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" #include "catalog/pg_depend.h" +#include "catalog/pg_dict.h" #include "catalog/pg_enum.h" #include "catalog/pg_language.h" #include "catalog/pg_namespace.h" @@ -679,6 +680,14 @@ RemoveTypeById(Oid typeOid) if (((Form_pg_type) GETSTRUCT(tup))->typtype == TYPTYPE_RANGE) RangeDelete(typeOid); + /* + * If it is a dictionary type, delete the pg_dict entries too; we don't + * bother with making a dependency entry for that, so it has to be done + * "by hand" here. + */ + if (((Form_pg_type) GETSTRUCT(tup))->typtype == TYPTYPE_DICT) + DictEntriesDelete(typeOid); + ReleaseSysCache(tup); table_close(relation, RowExclusiveLock); @@ -763,8 +772,8 @@ DefineDomain(CreateDomainStmt *stmt) /* * Base type must be a plain base type, a composite type, another domain, - * an enum or a range type. Domains over pseudotypes would create a - * security hole. (It would be shorter to code this to just check for + * a dict, an enum or a range type. Domains over pseudotypes would create + * a security hole. 
(It would be shorter to code this to just check for * pseudotypes; but it seems safer to call out the specific typtypes that * are supported, rather than assume that all future typtypes would be * automatically supported.) @@ -772,6 +781,7 @@ DefineDomain(CreateDomainStmt *stmt) typtype = baseType->typtype; if (typtype != TYPTYPE_BASE && typtype != TYPTYPE_COMPOSITE && + typtype != TYPTYPE_DICT && typtype != TYPTYPE_DOMAIN && typtype != TYPTYPE_ENUM && typtype != TYPTYPE_RANGE && @@ -1129,6 +1139,152 @@ DefineDomain(CreateDomainStmt *stmt) } +/* + * DefineDictionary + * Registers a new dictionary. + */ +ObjectAddress +DefineDictionary(CreateDictionaryStmt *stmt) +{ + char *dictName; + char *dictBaseTypeName; + char *dictArrayName; + Oid dictNamespace; + AclResult aclresult; + Oid old_type_oid; + Oid dictArrayOid; + ObjectAddress dictTypeAddr; + + Assert(list_length(stmt->baseTypeName) == 1); + dictBaseTypeName = strVal(linitial(stmt->baseTypeName)); + if(pg_strcasecmp(dictBaseTypeName, "jsonb") != 0) + ereport(ERROR, + ( + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Dictionary types currently can be used only with JSONB") + ) + ); + + /* Convert list of names to a name and namespace */ + dictNamespace = QualifiedNameGetCreationNamespace(stmt->typeName, + &dictName); + + /* Check we have creation rights in target namespace */ + aclresult = pg_namespace_aclcheck(dictNamespace, GetUserId(), ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_SCHEMA, + get_namespace_name(dictNamespace)); + + /* + * Check for collision with an existing type name. If there is one and + * it's an autogenerated array, we can rename it out of the way. 
+ */ + old_type_oid = GetSysCacheOid2(TYPENAMENSP, Anum_pg_type_oid, + CStringGetDatum(dictName), + ObjectIdGetDatum(dictNamespace)); + if (OidIsValid(old_type_oid)) + { + if (!moveArrayTypeName(old_type_oid, dictName, dictNamespace)) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("type \"%s\" already exists", dictName))); + } + + /* Allocate OID for array type */ + dictArrayOid = AssignTypeArrayOid(); + + /* Create the pg_type entry */ + dictTypeAddr = + TypeCreate(InvalidOid, /* no predetermined type OID */ + dictName, /* type name */ + dictNamespace, /* namespace */ + InvalidOid, /* relation oid (n/a here) */ + 0, /* relation kind (ditto) */ + GetUserId(), /* owner's ID */ + -1, /* internal size: varlena, as for JSONB */ + TYPTYPE_DICT, /* type-type: dictionary */ + TYPCATEGORY_USER, /* type-category: user, as for JSONB */ + false, /* dict types are never preferred */ + DEFAULT_TYPDELIM, /* array element delimiter */ + F_DICTIONARY_IN, /* input procedure */ + F_DICTIONARY_OUT, /* output procedure */ + InvalidOid, /* receive procedure: none */ + InvalidOid, /* send procedure: none */ + InvalidOid, /* typmodin procedure - none */ + InvalidOid, /* typmodout procedure - none */ + InvalidOid, /* analyze procedure - default */ + InvalidOid, /* subscript procedure - none */ + InvalidOid, /* element type ID */ + false, /* this is not an array type */ + dictArrayOid, /* array type we are about to create */ + InvalidOid, /* base type ID (only for domains) */ + NULL, /* never a default type value */ + NULL, /* binary default isn't sent either */ + false, /* passed by value: same as for JSONB */ + TYPALIGN_INT, /* int alignment */ + TYPSTORAGE_EXTENDED, /* TOAST strategy: fully toastable, as + * JSONB */ + -1, /* typMod (Domains only) */ + 0, /* Array dimensions of typbasetype */ + false, /* Type NOT NULL */ + InvalidOid); /* type's collation */ + + /* + * Create the array type that goes with it. 
+ */ + dictArrayName = makeArrayTypeName(dictName, dictNamespace); + + TypeCreate(dictArrayOid, /* force assignment of this type OID */ + dictArrayName, /* type name */ + dictNamespace, /* namespace */ + InvalidOid, /* relation oid (n/a here) */ + 0, /* relation kind (ditto) */ + GetUserId(), /* owner's ID */ + -1, /* internal size (always varlena) */ + TYPTYPE_BASE, /* type-type (base type) */ + TYPCATEGORY_ARRAY, /* type-category (array) */ + false, /* array types are never preferred */ + DEFAULT_TYPDELIM, /* array element delimiter */ + F_ARRAY_IN, /* input procedure */ + F_ARRAY_OUT, /* output procedure */ + F_ARRAY_RECV, /* receive procedure */ + F_ARRAY_SEND, /* send procedure */ + InvalidOid, /* typmodin procedure - none */ + InvalidOid, /* typmodout procedure - none */ + F_ARRAY_TYPANALYZE, /* analyze procedure */ + F_ARRAY_SUBSCRIPT_HANDLER, /* array subscript procedure */ + dictTypeAddr.objectId, /* element type ID */ + true, /* yes this is an array type */ + InvalidOid, /* no further array type */ + InvalidOid, /* base type ID */ + NULL, /* never a default type value */ + NULL, /* binary default isn't sent either */ + false, /* passed by value: same as for JSONB */ + TYPALIGN_INT, /* enums have int align, so do their arrays */ + TYPSTORAGE_EXTENDED, /* ARRAY is always toastable */ + -1, /* typMod (Domains only) */ + 0, /* Array dimensions of typbasetype */ + false, /* Type NOT NULL */ + InvalidOid); /* type's collation */ + + pfree(dictArrayName); + + /* Enter the dict's entries into pg_dict */ + DictEntriesCreate(dictTypeAddr.objectId, stmt->vals); + + /* Create casts to and from JSONB */ + CastCreate(JSONBOID, dictTypeAddr.objectId, F_JSONB_DICTIONARY, 'a', 'f', DEPENDENCY_INTERNAL); + CastCreate(dictTypeAddr.objectId, JSONBOID, F_DICTIONARY_JSONB, 'i', 'f', DEPENDENCY_INTERNAL); + + /* + * Create explicit cast to bytea. This is convenient for debugging purposes. + * Casting bytea to a dictionary type is dangerous and thus not supported. 
+ */ + CastCreate(dictTypeAddr.objectId, BYTEAOID, F_DICTIONARY_BYTEA, 'e', 'f', DEPENDENCY_INTERNAL); + + return dictTypeAddr; +} + /* * DefineEnum * Registers a new enum. diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index f9460ae506..7ad3de3a00 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -1708,6 +1708,7 @@ check_sql_fn_retval(List *queryTreeLists, if (fn_typtype == TYPTYPE_BASE || fn_typtype == TYPTYPE_DOMAIN || + fn_typtype == TYPTYPE_DICT || fn_typtype == TYPTYPE_ENUM || fn_typtype == TYPTYPE_RANGE || fn_typtype == TYPTYPE_MULTIRANGE) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 205506305b..9905a1a828 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4440,6 +4440,18 @@ _copyCompositeTypeStmt(const CompositeTypeStmt *from) return newnode; } +static CreateDictionaryStmt * +_copyCreateDictionaryStmt(const CreateDictionaryStmt *from) +{ + CreateDictionaryStmt *newnode = makeNode(CreateDictionaryStmt); + + COPY_NODE_FIELD(typeName); + COPY_NODE_FIELD(baseTypeName); + COPY_NODE_FIELD(vals); + + return newnode; +} + static CreateEnumStmt * _copyCreateEnumStmt(const CreateEnumStmt *from) { @@ -6181,6 +6193,9 @@ copyObjectImpl(const void *from) case T_CompositeTypeStmt: retval = _copyCompositeTypeStmt(from); break; + case T_CreateDictionaryStmt: + retval = _copyCreateDictionaryStmt(from); + break; case T_CreateEnumStmt: retval = _copyCreateEnumStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 9688b22a4b..357b10a106 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1962,6 +1962,16 @@ _equalCompositeTypeStmt(const CompositeTypeStmt *a, const CompositeTypeStmt *b) return true; } +static bool +_equalCreateDictionaryStmt(const CreateDictionaryStmt *a, const CreateDictionaryStmt *b) +{ + COMPARE_NODE_FIELD(typeName); + 
COMPARE_NODE_FIELD(baseTypeName); + COMPARE_NODE_FIELD(vals); + + return true; +} + static bool _equalCreateEnumStmt(const CreateEnumStmt *a, const CreateEnumStmt *b) { @@ -3983,6 +3993,9 @@ equal(const void *a, const void *b) case T_CompositeTypeStmt: retval = _equalCompositeTypeStmt(a, b); break; + case T_CreateDictionaryStmt: + retval = _equalCreateDictionaryStmt(a, b); + break; case T_CreateEnumStmt: retval = _equalCreateEnumStmt(a, b); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 989db0dbec..5c8a7e4586 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -6237,6 +6237,14 @@ DefineStmt: n->coldeflist = $6; $$ = (Node *) n; } + | CREATE TYPE_P any_name AS DICTIONARY OF any_name '(' opt_enum_val_list ')' + { + CreateDictionaryStmt *n = makeNode(CreateDictionaryStmt); + n->typeName = $3; + n->baseTypeName = $7; + n->vals = $9; + $$ = (Node *)n; + } | CREATE TYPE_P any_name AS ENUM_P '(' opt_enum_val_list ')' { CreateEnumStmt *n = makeNode(CreateEnumStmt); diff --git a/src/backend/parser/parse_coerce.c b/src/backend/parser/parse_coerce.c index c4e958e4aa..e489dd9421 100644 --- a/src/backend/parser/parse_coerce.c +++ b/src/backend/parser/parse_coerce.c @@ -279,7 +279,22 @@ coerce_type(ParseState *pstate, Node *node, if (baseTypeId == INTERVALOID) inputTypeMod = baseTypeMod; else - inputTypeMod = -1; + { + if((int32)targetTypeId == targetTypeMod) + { + /* + * For dictionaries internally we set typmod to a dictionary + * OID. This allows dictionary_in() and jsonb_dictionary() to + * distinguish one dictionary from another. + * + * It also allows us to recognize that the target type is + * a dictionary here and pass a correct typmod. 
+ */ + inputTypeMod = targetTypeMod; + } + else + inputTypeMod = -1; + } baseType = typeidType(baseTypeId); diff --git a/src/backend/parser/parse_type.c b/src/backend/parser/parse_type.c index 307114a30d..868b24fd9f 100644 --- a/src/backend/parser/parse_type.c +++ b/src/backend/parser/parse_type.c @@ -259,6 +259,10 @@ LookupTypeNameOid(ParseState *pstate, const TypeName *typeName, bool missing_ok) * This is equivalent to LookupTypeName, except that this will report * a suitable error message if the type cannot be found or is not defined. * Callers of this can therefore assume the result is a fully valid type. + * + * For a dictionary type it's OID is returned as a typmod. This is used by + * dictionary_in() and jsonb_dictionary() to distinguish one dictionary from + * another. */ Type typenameType(ParseState *pstate, const TypeName *typeName, int32 *typmod_p) @@ -305,6 +309,10 @@ typenameTypeId(ParseState *pstate, const TypeName *typeName) * * This is equivalent to typenameType, but we only hand back the type OID * and typmod, not the syscache entry. + * + * For a dictionary type it's OID is returned as a typmod. This is used by + * dictionary_in() and jsonb_dictionary() to distinguish one dictionary from + * another. */ void typenameTypeIdAndMod(ParseState *pstate, const TypeName *typeName, @@ -327,6 +335,10 @@ typenameTypeIdAndMod(ParseState *pstate, const TypeName *typeName, * looked up, and is passed as "typ". * * pstate is only used for error location info, and may be NULL. + * + * For a dictionary type it's OID is returned as a typmod. This is used by + * dictionary_in() and jsonb_dictionary() to distinguish one dictionary from + * another. */ static int32 typenameTypeMod(ParseState *pstate, const TypeName *typeName, Type typ) @@ -339,6 +351,10 @@ typenameTypeMod(ParseState *pstate, const TypeName *typeName, Type typ) ArrayType *arrtypmod; ParseCallbackState pcbstate; + /* For a dictionary return it's OID. 
*/ + if (((Form_pg_type) GETSTRUCT(typ))->typtype == TYPTYPE_DICT) + return (int32)((Form_pg_type) GETSTRUCT(typ))->oid; + /* Return prespecified typmod if no typmod expressions */ if (typeName->typmods == NIL) return typeName->typemod; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 0e7b7b3138..cffef5d707 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -174,6 +174,7 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) case T_CreateCastStmt: case T_CreateConversionStmt: case T_CreateDomainStmt: + case T_CreateDictionaryStmt: case T_CreateEnumStmt: case T_CreateEventTrigStmt: case T_CreateExtensionStmt: @@ -1621,6 +1622,10 @@ ProcessUtilitySlow(ParseState *pstate, } break; + case T_CreateDictionaryStmt: /* CREATE TYPE AS DICTIONARY OF */ + address = DefineDictionary((CreateDictionaryStmt *) parsetree); + break; + case T_CreateEnumStmt: /* CREATE TYPE AS ENUM */ address = DefineEnum((CreateEnumStmt *) parsetree); break; @@ -2766,6 +2771,10 @@ CreateCommandTag(Node *parsetree) tag = CMDTAG_CREATE_TYPE; break; + case T_CreateDictionaryStmt: + tag = CMDTAG_CREATE_TYPE; + break; + case T_CreateEnumStmt: tag = CMDTAG_CREATE_TYPE; break; @@ -3413,6 +3422,10 @@ GetCommandLogLevel(Node *parsetree) lev = LOGSTMT_DDL; break; + case T_CreateDictionaryStmt: + lev = LOGSTMT_DDL; + break; + case T_CreateEnumStmt: lev = LOGSTMT_DDL; break; diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 7c722ea2ce..a7ec3d89f3 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -30,6 +30,7 @@ OBJS = \ datetime.o \ datum.o \ dbsize.o \ + dictionaries.o \ domains.o \ encode.o \ enum.o \ diff --git a/src/backend/utils/adt/dictionaries.c b/src/backend/utils/adt/dictionaries.c new file mode 100644 index 0000000000..364ed7cb20 --- /dev/null +++ b/src/backend/utils/adt/dictionaries.c @@ -0,0 +1,479 @@ +/*------------------------------------------------------------------------- + * + * 
dictionaries.c + * Conversion functions for dictionary types. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/dictionaries.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_dict.h" +#include "utils/fmgrprotos.h" +#include "utils/jsonb.h" + +/* + * When compressing a data we treat it as a BLOB, in other words we don't + * assume anything regarding its internal representation. This is not + * necessarily the best and/or the only possible approach. However, it is + * universal and can be reused for JSONB, TEXT, XML and other types. Alternative + * compression methods can be introduced in the future, so that the user will + * be able to choose the best one for the task. + * + * The compressed data is stored in the following format: + * + * (struct varlena) + * dictionary_id [uint32] + * decompressed_size [uint32] + * algorithm_version [uint8] + * + * (repeated) { + * number of bytes to copy as-is [uint8] + * ... bytes to copy as-is ... + * dictionary entry id, or 0 to skip [uint32] + * } + * + * Compressed data is a variable-length type and thus has the 'struct varlena' + * header. The code below doesn't consider it being a part of the payload + * (e.g. see the DICT_COMP_HEADER_SIZE definition). + * + * Storing dictionary_id may seem redundant, but without it dictionary_out() + * and dictionary_jsonb() have no way to distinguish one dictionary type from + * another. dictionary_in() and jsonb_dictionary() get the dictionary id through + * the 'typemod' argument. + * + * Currently algorithm_version is always 0. In the future it will allow us to + * introduce new features (e.g. usage of varints) and/or lazily migrate the + * data to other compression methods. + * + * Compression and decompression are implemented based on a binary search over + * DictEntry[] array (see pg_dict.h). 
For compression the array is sorted by + * dictentries and for decompression - by oid's. + * + */ + +/* Size of the header in the compressed data */ +#define DICT_COMP_HEADER_SIZE (sizeof(uint32)*2 + sizeof(uint8)) + +/* Extracts dictionary_id from the compressed data */ +#define DICT_COMP_DICTIONARY_ID(hdrp) \ + (*(uint32*)hdrp) + +/* Extracts decompressed_size from the compressed data */ +#define DICT_COMP_DECOMPRESSED_SIZE(hdrp) \ + (*(uint32*)((uint8*)hdrp + sizeof(uint32))) + +/* Extracts algorithm_version from the compressed data */ +#define DICT_COMP_ALGORITHM_VERSION(hdrp) \ + (*(uint8*)((uint8*)hdrp + sizeof(uint32)*2)) + +/* Current algorithm_version */ +#define DICT_COMP_CURRENT_ALGORITHM_VERSION 0 + +/* + * bsearch_arg() callback for finding DictEntry by oid. + */ +static int +find_by_oid_cb(const void *key, const void *current, void *arg) +{ + /* Note that oids are unsigned, so we should be careful here */ + if (*(Oid *) key < ((DictEntry *) current)->oid) + return -1; + else if (*(Oid *) key == ((DictEntry *) current)->oid) + return 0; /* found! */ + else + return 1; +} + +/* + * qsort() callback for sorting DictEntry[] by oids. + */ +static int +sort_by_oids_cb(const void *left, const void *right) +{ + /* Note that oids are unsigned, so we should be careful here */ + if (((DictEntry *) left)->oid < ((DictEntry *) right)->oid) + return -1; + + /* Oids are unique so this callback will never return 0 */ + return 1; +} + +/* + * Finds a DictEntry which dictentry field matches *data and returns its Oid. + * If there are several matching entries, the largest is returned. The length + * of the found entry is written to *found_length on success. On failure + * InvalidOid is returned and found_length is zeroed. + * + * The implementation is similar to bsearch_arg(). The procedure can't be used + * directly because we are looking not by the exact match. 
We could generalize + * this case but the signature of the function becomes so complicated that it + * doesn't seem to worth the effort. + */ +static Oid +compress_find_oid(Dictionary dict, const uint8 *data, Size data_size, Size *found_length) +{ + int res; + int32 left = 0; + int32 right = dict->nentries - 1; + Size best_length = 0; + Oid best_match = InvalidOid; + + while (left <= right) + { + int32 current = (left + right) / 2; + Size nbytes = (Size)dict->entries[current].length; + + if (nbytes > data_size) + { + /* current can be less or greater depending on the prefix */ + res = memcmp(dict->entries[current].dictentry.data, data, data_size); + + /* if prefixes match, current is greater */ + if (res == 0) + res = 1; + } + else + res = memcmp(dict->entries[current].dictentry.data, data, nbytes); + + if (res == 0) /* match found */ + { + best_length = nbytes; + best_match = dict->entries[current].oid; + + if (nbytes == data_size) + break; + + /* maybe there is a larger match */ + left = current + 1; + } + else if (res < 0) /* current is less */ + left = current + 1; + else /* current is greater */ + right = current - 1; + } + + *found_length = best_length; + return best_match; +} + +/* + * Finds a DictEntry by Oid using a binary search. The dictionary should be + * sorted by oids before the call. Returns NULL if nothing was found. + */ +static DictEntry * +decompress_find_dictentry(Dictionary dict, Oid oid) +{ + return (DictEntry *) bsearch_arg(&oid, dict->entries, dict->nentries, sizeof(DictEntry), find_by_oid_cb, NULL); +} + +/* + * Estimates the worst-case compressed size of the data of given size. + * Worst-case scenario happens when the dictionary consists of single-character + * entries. In this case every byte will be encoded as 6 bytes: + * 0x00, (0 bytes to copy as-is), 4 bytes of the entry Oid + * + * This procedure doesn't account for the header size. 
+ * + * AALEKSEEV FIXME don't use dictionary entries shorter than 5 bytes + */ +static Size +worst_case_compressed_size(Size insize) +{ + return insize * 6; +} + +/* + * Compresses the data using the provided dictionary. The dictionary should + * be sorted by dictentries before the call. Output buffer should be at least + * worst_case_compressed_size(src_size) bytes in size. + */ +static void +compress(Dictionary dict, + const void *src_data_, Size src_size, + void *encoded_data_, Size *pencoded_size) +{ + Size nbytes; + Size inoffset; + Size outskipoffset = 0; + Size outoffset = 1; + uint8 skipbytes = 0; + const uint8 *src_data = src_data_; + uint8 *encoded_data = ((uint8 *) encoded_data_); + + for (inoffset = 0; inoffset < src_size;) + { + Oid code = compress_find_oid(dict, &(src_data[inoffset]), + src_size - inoffset, &nbytes); + + if (code == InvalidOid) + { + skipbytes++; + encoded_data[outoffset] = src_data[inoffset]; + outoffset++; + inoffset++; + + if (skipbytes == 255) + { + encoded_data[outskipoffset] = skipbytes; + encoded_data[outoffset++] = 0; /* InvalidOid */ + encoded_data[outoffset++] = 0; + encoded_data[outoffset++] = 0; + encoded_data[outoffset++] = 0; + outskipoffset = outoffset++; + skipbytes = 0; + } + } + else + { + encoded_data[outskipoffset] = skipbytes; + encoded_data[outoffset++] = (code >> 24) & 0xFF; + encoded_data[outoffset++] = (code >> 16) & 0xFF; + encoded_data[outoffset++] = (code >> 8) & 0xFF; + encoded_data[outoffset++] = code & 0xFF; + outskipoffset = outoffset++; + skipbytes = 0; + inoffset += nbytes; + } + } + + /* Double check that we didn't write out of buffer */ + Assert(outoffset < worst_case_compressed_size(src_size)); + + encoded_data[outskipoffset] = skipbytes; + *pencoded_size = outoffset; +} + +/* + * Report an internal error in decompress() procedure below. + * Under normal circumstances this should never happen. 
+ */ +static void +decompress_error() +{ + ereport(ERROR, + ( + errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unable to decompress a dictionary type"), + errdetail("The compressed data seems to be corrupted"), + errhint("Please report the steps to reproduce the issue to pgsql-bugs@") + )); +} + +/* + * Decompresses the data using the provided dictionary. The dictionary should + * be sorted by oids before the call. + */ +static void +decompress(Dictionary dict, + const void *encoded_data_, Size encoded_size, + void *decoded_data_, Size decoded_size) +{ + Size inoffset = 0; + Size outoffset = 0; + Oid code; + uint8 skipbytes; + const uint8 *encoded_data = ((uint8 *) encoded_data_); + uint8 *decoded_data = decoded_data_; + + for (inoffset = 0; inoffset < encoded_size;) + { + skipbytes = encoded_data[inoffset++]; + + if (skipbytes > decoded_size - outoffset) + decompress_error(); + + if (skipbytes > encoded_size - inoffset) + decompress_error(); + + memcpy( + &(decoded_data[outoffset]), + &(encoded_data[inoffset]), + skipbytes + ); + + outoffset += skipbytes; + inoffset += skipbytes; + + if ((encoded_size == inoffset) && (decoded_size == outoffset)) + break; /* end of input - its OK */ + + if (encoded_size - inoffset < 4) + decompress_error(); + + code = (Oid) encoded_data[inoffset++]; + code = (code << 8) | (Oid) encoded_data[inoffset++]; + code = (code << 8) | (Oid) encoded_data[inoffset++]; + code = (code << 8) | (Oid) encoded_data[inoffset++]; + + if (code != InvalidOid) + { + Size entrylen; + DictEntry *entry = decompress_find_dictentry(dict, code); + + if (entry == NULL) + decompress_error(); + + Assert(entry->oid == code); + entrylen = (Size)entry->length; + + if (entrylen > decoded_size - outoffset) + decompress_error(); + + memcpy( + &(decoded_data[outoffset]), + entry->dictentry.data, + entrylen + ); + + outoffset += entrylen; + } + } + + Assert(decoded_size == outoffset); +} + +/* + * Converts a cstring to a dictionary. 
+ */
+Datum
+dictionary_in(PG_FUNCTION_ARGS)
+{
+	const char *instr = PG_GETARG_CSTRING(0);
+#ifdef NOT_USED
+	Oid			typelem = PG_GETARG_OID(1);
+#endif
+	int32		typmod = PG_GETARG_INT32(2);
+	Datum		jsonb_datum;
+	Datum		compressed_datum;
+
+	/* For dictionary types the typmod always carries the dictionary Oid. */
+	Assert(typmod != -1);
+
+	/* Parse the C string as ordinary JSONB first... */
+	jsonb_datum = DirectFunctionCall1(jsonb_in, CStringGetDatum(instr));
+
+	/* ...then compress it with the dictionary identified by the typmod. */
+	compressed_datum = DirectFunctionCall2(jsonb_dictionary,
+										   jsonb_datum,
+										   Int32GetDatum(typmod));
+
+	PG_RETURN_DATUM(compressed_datum);
+}
+
+/*
+ * Converts a dictionary to a cstring.
+ */
+Datum
+dictionary_out(PG_FUNCTION_ARGS)
+{
+	bytea	   *dict = PG_GETARG_BYTEA_P(0);
+	Datum		jsonb_datum;
+	Datum		cstring_datum;
+
+	/* Decompress back into plain JSONB, then let jsonb_out do the printing. */
+	jsonb_datum = DirectFunctionCall1(dictionary_jsonb, PointerGetDatum(dict));
+	cstring_datum = DirectFunctionCall1(jsonb_out, jsonb_datum);
+
+	PG_RETURN_CSTRING(DatumGetCString(cstring_datum));
+}
+
+/*
+ * Converts JSONB to a dictionary type.
+ *
+ * AALEKSEEV FIXME: if the compressed document ends up being larger than
+ * the original one write a corresponding flag and copy all the data as-is.
+ */
+Datum
+jsonb_dictionary(PG_FUNCTION_ARGS)
+{
+	Dictionary	dict;
+	Jsonb	   *jsonb = PG_GETARG_JSONB_P(0);
+	int32		typmod = PG_GETARG_INT32(1);	/* the dictionary Oid */
+	uint8	   *jsonb_data = (uint8 *) VARDATA(jsonb);
+	Size		jsonb_data_size = VARSIZE(jsonb) - VARHDRSZ;
+	uint8	   *encoded_buff,
+			   *encoded_header,
+			   *encoded_data;
+	Size		encoded_size,
+				encoded_buff_size;
+
+	/* For dictionary types the typmod always carries the dictionary Oid. */
+	Assert(typmod != -1);
+
+	/*
+	 * NOTE(review): compress() documents that the dictionary must be sorted
+	 * by dictentries, but sort_by_oids_cb sorts by Oid.  Either the comment
+	 * on compress() or this comparator looks wrong — verify against what
+	 * compress_find_oid() actually expects.
+	 */
+	dict = DictEntriesRead(typmod);
+	qsort((void *) dict->entries, dict->nentries, sizeof(DictEntry), sort_by_oids_cb);
+
+	/* varlena header + our compression header + worst-case payload */
+	encoded_buff_size = VARHDRSZ + DICT_COMP_HEADER_SIZE + worst_case_compressed_size(jsonb_data_size);
+	encoded_buff = palloc(encoded_buff_size);
+	encoded_header = (uint8 *) VARDATA(encoded_buff);
+	encoded_data = encoded_header + DICT_COMP_HEADER_SIZE;
+
+	/*
+	 * The header records everything dictionary_jsonb() needs to reverse the
+	 * transformation: which dictionary, the original size, and the format
+	 * version.
+	 */
+	DICT_COMP_DICTIONARY_ID(encoded_header) = typmod;
+	DICT_COMP_DECOMPRESSED_SIZE(encoded_header) = jsonb_data_size;
+	DICT_COMP_ALGORITHM_VERSION(encoded_header) = DICT_COMP_CURRENT_ALGORITHM_VERSION;
+
+	encoded_size = encoded_buff_size - VARHDRSZ - DICT_COMP_HEADER_SIZE;
+
+	compress(dict, jsonb_data, jsonb_data_size,
+			 encoded_data, &encoded_size);
+
+	/* compress() returned the payload size; add the headers back in. */
+	encoded_size += VARHDRSZ + DICT_COMP_HEADER_SIZE;
+
+	/* Shrink the allocation down to what was actually used. */
+	encoded_buff = repalloc(encoded_buff, encoded_size);
+	SET_VARSIZE(encoded_buff, encoded_size);
+
+	/*
+	 * NOTE(review): assumes DictEntriesRead() allocates the DictionaryData
+	 * header and its entries array in a single palloc chunk — otherwise
+	 * dict->entries leaks here.  TODO confirm.
+	 */
+	pfree(dict);
+	PG_RETURN_BYTEA_P(encoded_buff);
+}
+
+/*
+ * Converts a dictionary type to JSONB.
+ */
+Datum
+dictionary_jsonb(PG_FUNCTION_ARGS)
+{
+	bytea	   *encoded_buff = PG_GETARG_BYTEA_P(0);
+	uint8	   *encoded_header = (uint8 *) VARDATA(encoded_buff);
+	uint8	   *encoded_data = encoded_header + DICT_COMP_HEADER_SIZE;
+	Size		encoded_size = VARSIZE(encoded_buff) - VARHDRSZ - DICT_COMP_HEADER_SIZE;
+	int			alg_version = DICT_COMP_ALGORITHM_VERSION(encoded_header);
+	Oid			dictOid;		/* cannot read until algorithm version is
+								 * checked */
+	uint32		decoded_size;	/* cannot read until algorithm version is
+								 * checked */
+	Jsonb	   *jsonb;
+	uint8	   *jsonb_data;
+	Dictionary	dict;
+
+	/*
+	 * Only newer versions are rejected; every version up to the current one
+	 * is presumably readable by the code below.
+	 */
+	if (alg_version > DICT_COMP_CURRENT_ALGORITHM_VERSION)
+		ereport(ERROR,
+				(
+				 errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Unsupported compression algorithm version"),
+				 errdetail("Saved algorithm version is %d, current version is %d",
+						   alg_version, DICT_COMP_CURRENT_ALGORITHM_VERSION),
+				 errhint("The data is either corrupted or imported from "
+						 "the future version of PostgreSQL")
+				));
+
+	dictOid = DICT_COMP_DICTIONARY_ID(encoded_header);
+	decoded_size = DICT_COMP_DECOMPRESSED_SIZE(encoded_header);
+
+	/*
+	 * NOTE(review): decompress() documents that the dictionary must be
+	 * sorted by oids, yet no qsort is performed here (jsonb_dictionary does
+	 * sort explicitly).  Presumably DictEntriesRead() already returns the
+	 * entries in a suitable order — verify.
+	 */
+	dict = DictEntriesRead(dictOid);
+
+	/*
+	 * decoded_size comes from the stored header and is not validated before
+	 * this allocation; decompress() bounds-checks all writes against it, but
+	 * a corrupted header could still request an arbitrarily large palloc.
+	 */
+	jsonb = palloc(decoded_size + VARHDRSZ);
+	jsonb_data = (uint8 *) VARDATA(jsonb);
+
+	decompress(dict, encoded_data, encoded_size, jsonb_data, decoded_size);
+
+	decoded_size += VARHDRSZ;
+	SET_VARSIZE(jsonb, decoded_size);
+
+	pfree(dict);
+	PG_RETURN_JSONB_P(jsonb);
+}
+
+/*
+ * Converts a dictionary type to bytea.
+ */ +Datum +dictionary_bytea(PG_FUNCTION_ARGS) +{ + bytea *compressed_data = PG_GETARG_BYTEA_P(0); + PG_RETURN_BYTEA_P(compressed_data); +} \ No newline at end of file diff --git a/src/backend/utils/fmgr/funcapi.c b/src/backend/utils/fmgr/funcapi.c index 9197b0f1e2..4f9dc52428 100644 --- a/src/backend/utils/fmgr/funcapi.c +++ b/src/backend/utils/fmgr/funcapi.c @@ -1292,6 +1292,7 @@ get_type_func_class(Oid typid, Oid *base_typeid) case TYPTYPE_COMPOSITE: return TYPEFUNC_COMPOSITE; case TYPTYPE_BASE: + case TYPTYPE_DICT: case TYPTYPE_ENUM: case TYPTYPE_RANGE: case TYPTYPE_MULTIRANGE: diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 7cc9c72e49..1c85b3f7b0 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -10178,6 +10178,8 @@ dumpType(Archive *fout, const TypeInfo *tyinfo) dumpDomain(fout, tyinfo); else if (tyinfo->typtype == TYPTYPE_COMPOSITE) dumpCompositeType(fout, tyinfo); + else if (tyinfo->typtype == TYPTYPE_DICT) + pg_log_error("AALEKSEEV TODO FIXME not implemented"); else if (tyinfo->typtype == TYPTYPE_ENUM) dumpEnumType(fout, tyinfo); else if (tyinfo->typtype == TYPTYPE_RANGE) diff --git a/src/include/catalog/pg_dict.h b/src/include/catalog/pg_dict.h new file mode 100644 index 0000000000..d05303f724 --- /dev/null +++ b/src/include/catalog/pg_dict.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * pg_dict.h + * definition of the "dict" system catalog (pg_dict) + * + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * src/include/catalog/pg_dict.h + * + * NOTES + * The Catalog.pm module reads this file and derives schema + * information. + * + *------------------------------------------------------------------------- + */ +#ifndef PG_DICT_H +#define PG_DICT_H + +#include "catalog/genbki.h" +#include "catalog/pg_dict_d.h" + +#include "nodes/pg_list.h" + +/* ---------------- + * pg_dict definition. 
cpp turns this into
+ *		typedef struct FormData_pg_dict
+ * ----------------
+ */
+CATALOG(pg_dict,9861,DictRelationId)
+{
+	Oid			oid;			/* oid */
+	Oid			dicttypid BKI_LOOKUP(pg_type);	/* OID of owning dict type */
+	NameData	dictentry;		/* text representation of the dictionary entry */
+} FormData_pg_dict;
+
+/* ----------------
+ *		Form_pg_dict corresponds to a pointer to a tuple with
+ *		the format of pg_dict relation.
+ * ----------------
+ */
+typedef FormData_pg_dict *Form_pg_dict;
+
+/* primary key; also used by decompression to look an entry up by its Oid */
+DECLARE_UNIQUE_INDEX_PKEY(pg_dict_oid_index, 9862, DictOidIndexId, on pg_dict using btree(oid oid_ops));
+/* enforces that a dictionary has no duplicate entries */
+DECLARE_UNIQUE_INDEX(pg_dict_typid_entry_index, 9863, DictTypIdEntryIndexId, on pg_dict using btree(dicttypid oid_ops, dictentry name_ops));
+
+
+/*
+ * DictEntry type represents one entry in the given dictionary.
+ *
+ * NOTE(review): dictentry is a NameData, which caps an entry at
+ * NAMEDATALEN-1 bytes — confirm that longer entries are rejected at
+ * CREATE TYPE ... AS DICTIONARY time.
+ */
+typedef struct
+{
+	Oid			oid;			/* entry Oid */
+	uint32		length;			/* entry length (bytes used in dictentry) */
+	NameData	dictentry;		/* the entry */
+} DictEntry;
+
+/*
+ * Dictionary type represents all the entries in the given dictionary.
+ */ +typedef struct +{ + uint32 nentries; /* number of entries */ + DictEntry *entries; /* array of entries */ +} DictionaryData; + +typedef DictionaryData *Dictionary; + +/* + * prototypes for functions in pg_dict.c + */ +extern void DictEntriesCreate(Oid dictTypeOid, List *vals); +extern Dictionary DictEntriesRead(Oid dictTypeOid); +extern void DictEntriesDelete(Oid dictTypeOid); + +#endif /* PG_DICT_H */ diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index babe16f00a..81a020bc2f 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10024,6 +10024,26 @@ proname => 'jsonb_path_match_opr', prorettype => 'bool', proargtypes => 'jsonb jsonpath', prosrc => 'jsonb_path_match_opr' }, +# dictionaries +{ oid => '9864', descr => 'Converts a cstring to a dictionary', + proname => 'dictionary_in', prorettype => 'any', proargtypes => 'cstring oid int4', + prosrc => 'dictionary_in' }, +{ oid => '9865', descr => 'Converts a dictionary to a cstring', + proname => 'dictionary_out', prorettype => 'cstring', proargtypes => 'any', + prosrc => 'dictionary_out' }, +{ oid => '9866', descr => 'Coverts JSONB to a dictionary type', + proname => 'jsonb_dictionary', proisstrict => 'f', provolatile => 's', + prorettype => 'any', proargtypes => 'jsonb int4', + prosrc => 'jsonb_dictionary' }, +{ oid => '9867', descr => 'Converts a dictionary type to JSONB', + proname => 'dictionary_jsonb', proisstrict => 'f', provolatile => 's', + prorettype => 'jsonb', proargtypes => 'any', + prosrc => 'dictionary_jsonb' }, +{ oid => '9868', descr => 'Converts a dictionary type to a byte array', + proname => 'dictionary_bytea', proisstrict => 'f', provolatile => 's', + prorettype => 'jsonb', proargtypes => 'any', + prosrc => 'dictionary_bytea' }, + # historical int8/txid_snapshot variants of xid8 functions { oid => '2939', descr => 'I/O', proname => 'txid_snapshot_in', prorettype => 'txid_snapshot', diff --git a/src/include/catalog/pg_type.h 
b/src/include/catalog/pg_type.h index 48a2559137..91c6bfae99 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -272,6 +272,7 @@ DECLARE_UNIQUE_INDEX(pg_type_typname_nsp_index, 2704, TypeNameNspIndexId, on pg_ */ #define TYPTYPE_BASE 'b' /* base type (ordinary scalar type) */ #define TYPTYPE_COMPOSITE 'c' /* composite (e.g., table's rowtype) */ +#define TYPTYPE_DICT 'D' /* dictionary */ #define TYPTYPE_DOMAIN 'd' /* domain over another type */ #define TYPTYPE_ENUM 'e' /* enumerated type */ #define TYPTYPE_MULTIRANGE 'm' /* multirange type */ diff --git a/src/include/commands/typecmds.h b/src/include/commands/typecmds.h index a17bedb851..5cc8e5ff99 100644 --- a/src/include/commands/typecmds.h +++ b/src/include/commands/typecmds.h @@ -24,6 +24,7 @@ extern ObjectAddress DefineType(ParseState *pstate, List *names, List *parameters); extern void RemoveTypeById(Oid typeOid); extern ObjectAddress DefineDomain(CreateDomainStmt *stmt); +extern ObjectAddress DefineDictionary(CreateDictionaryStmt *stmt); extern ObjectAddress DefineEnum(CreateEnumStmt *stmt); extern ObjectAddress DefineRange(ParseState *pstate, CreateRangeStmt *stmt); extern ObjectAddress AlterEnum(AlterEnumStmt *stmt); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index b3b407579b..031369cee3 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -408,6 +408,7 @@ typedef enum NodeTag T_DropOwnedStmt, T_ReassignOwnedStmt, T_CompositeTypeStmt, + T_CreateDictionaryStmt, T_CreateEnumStmt, T_CreateRangeStmt, T_AlterEnumStmt, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 73f635b455..4e2e7af20d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3563,6 +3563,18 @@ typedef struct CompositeTypeStmt List *coldeflist; /* list of ColumnDef nodes */ } CompositeTypeStmt; +/* ---------------------- + * Create Type Statement, dictionary types + * ---------------------- + */ +typedef 
struct CreateDictionaryStmt +{ + NodeTag type; + List *typeName; /* qualified name (list of String) */ + List *baseTypeName; /* qualified base type name (list of String) */ + List *vals; /* dictionary entries (list of String) */ +} CreateDictionaryStmt; + /* ---------------------- * Create Type Statement, enum types * ---------------------- diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c index b791c23f06..897051c1f7 100644 --- a/src/pl/plpgsql/src/pl_comp.c +++ b/src/pl/plpgsql/src/pl_comp.c @@ -2129,6 +2129,7 @@ build_datatype(HeapTuple typeTup, int32 typmod, switch (typeStruct->typtype) { case TYPTYPE_BASE: + case TYPTYPE_DICT: case TYPTYPE_ENUM: case TYPTYPE_RANGE: case TYPTYPE_MULTIRANGE: diff --git a/src/test/modules/test_oat_hooks/test_oat_hooks.c b/src/test/modules/test_oat_hooks/test_oat_hooks.c index 7ef272cc7a..0dceca435a 100644 --- a/src/test/modules/test_oat_hooks/test_oat_hooks.c +++ b/src/test/modules/test_oat_hooks/test_oat_hooks.c @@ -1380,6 +1380,9 @@ nodetag_to_string(NodeTag tag) case T_CompositeTypeStmt: return "CompositeTypeStmt"; break; + case T_CreateDictionaryStmt: + return "CreateDictionaryStmt"; + break; case T_CreateEnumStmt: return "CreateEnumStmt"; break; diff --git a/src/test/regress/expected/dict.out b/src/test/regress/expected/dict.out new file mode 100644 index 0000000000..699e76b654 --- /dev/null +++ b/src/test/regress/expected/dict.out @@ -0,0 +1,47 @@ +-- +-- Compression dictionaries tests +-- +-- Dictionary types currently can be used only with JSONB +\set ON_ERROR_STOP 0 +CREATE TYPE textdict AS DICTIONARY OF TEXT ('abcdef', 'ghijkl'); +ERROR: Dictionary types currently can be used only with JSONB +\set ON_ERROR_STOP 1 +-- Simple happy path +CREATE TYPE mydict AS DICTIONARY OF JSONB ('abcdef', 'ghijkl'); +SELECT dictentry FROM pg_dict ORDER BY dictentry; + dictentry +----------- + abcdef + ghijkl +(2 rows) + +SELECT '{"abcdef":"ghijkl"}' :: mydict; + mydict +---------------------- + {"abcdef": "ghijkl"} 
+(1 row) + +SELECT '{"abcdef":"ghijkl"}' :: mydict :: jsonb; + jsonb +---------------------- + {"abcdef": "ghijkl"} +(1 row) + +SELECT '{"abcdef":"ghijkl"}' :: jsonb :: mydict; + mydict +---------------------- + {"abcdef": "ghijkl"} +(1 row) + +SELECT ('{"abcdef":"ghijkl"}' :: mydict) -> 'abcdef'; + ?column? +---------- + "ghijkl" +(1 row) + +DROP TYPE mydict; +SELECT dictentry FROM pg_dict ORDER BY dictentry; + dictentry +----------- +(0 rows) + diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out index 215eb899be..91bc491a6c 100644 --- a/src/test/regress/expected/oidjoins.out +++ b/src/test/regress/expected/oidjoins.out @@ -182,6 +182,7 @@ NOTICE: checking pg_description {classoid} => pg_class {oid} NOTICE: checking pg_cast {castsource} => pg_type {oid} NOTICE: checking pg_cast {casttarget} => pg_type {oid} NOTICE: checking pg_cast {castfunc} => pg_proc {oid} +NOTICE: checking pg_dict {dicttypid} => pg_type {oid} NOTICE: checking pg_enum {enumtypid} => pg_type {oid} NOTICE: checking pg_namespace {nspowner} => pg_authid {oid} NOTICE: checking pg_conversion {connamespace} => pg_namespace {oid} diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 86d755aa44..5dc4d0ab01 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -388,11 +388,12 @@ WHERE 'cstring'::regtype = ANY (p1.proargtypes) AND NOT EXISTS(SELECT 1 FROM pg_conversion WHERE conproc = p1.oid) AND p1.oid != 'shell_in(cstring)'::regprocedure ORDER BY 1; - oid | proname -------+-------------- + oid | proname +------+--------------- 2293 | cstring_out 2501 | cstring_send -(2 rows) + 9864 | dictionary_in +(3 rows) -- Likewise, look for functions that return cstring and aren't datatype output -- functions nor typmod output functions. 
@@ -405,11 +406,12 @@ WHERE p1.prorettype = 'cstring'::regtype AND NOT EXISTS(SELECT 1 FROM pg_type WHERE typmodout = p1.oid) AND p1.oid != 'shell_out(void)'::regprocedure ORDER BY 1; - oid | proname -------+-------------- + oid | proname +------+---------------- 2292 | cstring_in 2500 | cstring_recv -(2 rows) + 9865 | dictionary_out +(3 rows) -- Check for length inconsistencies between the various argument-info arrays. SELECT p1.oid, p1.proname diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 103e11483d..2fa15a2080 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -111,7 +111,7 @@ test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combo # ---------- # Another group of parallel tests (JSON related) # ---------- -test: json jsonb json_encoding jsonpath jsonpath_encoding jsonb_jsonpath sqljson json_sqljson jsonb_sqljson +test: dict json jsonb json_encoding jsonpath jsonpath_encoding jsonb_jsonpath sqljson json_sqljson jsonb_sqljson # ---------- # Another group of parallel tests diff --git a/src/test/regress/sql/dict.sql b/src/test/regress/sql/dict.sql new file mode 100644 index 0000000000..27ecd6a451 --- /dev/null +++ b/src/test/regress/sql/dict.sql @@ -0,0 +1,20 @@ +-- +-- Compression dictionaries tests +-- + +-- Dictionary types currently can be used only with JSONB +\set ON_ERROR_STOP 0 +CREATE TYPE textdict AS DICTIONARY OF TEXT ('abcdef', 'ghijkl'); +\set ON_ERROR_STOP 1 + +-- Simple happy path +CREATE TYPE mydict AS DICTIONARY OF JSONB ('abcdef', 'ghijkl'); +SELECT dictentry FROM pg_dict ORDER BY dictentry; + +SELECT '{"abcdef":"ghijkl"}' :: mydict; +SELECT '{"abcdef":"ghijkl"}' :: mydict :: jsonb; +SELECT '{"abcdef":"ghijkl"}' :: jsonb :: mydict; +SELECT ('{"abcdef":"ghijkl"}' :: mydict) -> 'abcdef'; + +DROP TYPE mydict; +SELECT dictentry FROM pg_dict ORDER BY dictentry; -- 2.35.1