Thread: TODO Item - Return compressed length of TOAST datatypes (WIP)
I thought I would have a look at: (Datatypes) Add function to return compressed length of TOAST data values. A WIP patch is attached for comment (wanted to check I hadn't bitten off more than I could chew *before* asking questions!). A few questions come to mind: 1) The name - I have called it 'toast_compressed_length'. Seems longish - I'm wondering if just 'compressed_length' is ok? 2) What should be returned for toasted data that is not compressed (or plain stored data for that matter)? The WIP patch just gives the uncompressed size (I notice I may need to subtract VARHDRSZ in some cases). 3) What should be returned for non-varlena types? The WIP patch is treating everything as a varlena, so is returning incorrect information for that case. 4) The builtin is declared as immutable - I am not so sure about that (I am wondering if altering a column's storage from MAIN -> EXTENDED and then updating the column to be itself will fool it). 5) Any multi-byte locale considerations? regards Mark diff -Nacr src/include/catalog/pg_proc.h.orig src/include/catalog/pg_proc.h *** src/include/catalog/pg_proc.h.orig Fri Jun 17 15:30:17 2005 --- src/include/catalog/pg_proc.h Fri Jun 17 17:08:18 2005 *************** *** 3655,3660 **** --- 3655,3664 ---- DATA(insert OID = 2560 ( pg_postmaster_start_time PGNSP PGUID 12 f f t f s 0 1184 "" _null_ _null_ _null_ pgsql_postmaster_start_time- _null_ )); DESCR("postmaster start time"); + /* Toast compressed length */ + DATA(insert OID = 2561 ( toast_compressed_length PGNSP PGUID 12 f f t f i 1 23 "25" _null_ _null_ _null_ toast_compressed_length- _null_ )); + DESCR("toast compressed length"); + /* * Symbolic values for provolatile column: these indicate whether the result diff -Nacr src/include/access/tuptoaster.h.orig src/include/access/tuptoaster.h *** src/include/access/tuptoaster.h.orig Thu Jun 16 21:12:57 2005 --- src/include/access/tuptoaster.h Thu Jun 16 21:14:06 2005 *************** *** 138,141 **** --- 138,149 ---- */ extern Size toast_raw_datum_size(Datum value); + /* ---------- + * toast_compressed_datum_size - + * + * Return the compressed (toasted) size of a varlena datum + * ---------- + */ + extern Size toast_compressed_datum_size(Datum value); + #endif /* TUPTOASTER_H */ diff -Nacr src/include/utils/pg_lzcompress.h.orig src/include/utils/pg_lzcompress.h *** src/include/utils/pg_lzcompress.h.orig Thu Jun 16 21:21:37 2005 --- src/include/utils/pg_lzcompress.h Thu Jun 16 21:21:11 2005 *************** *** 228,231 **** --- 228,238 ---- extern int pglz_get_next_decomp_char_from_lzdata(PGLZ_DecompState *dstate); extern int pglz_get_next_decomp_char_from_plain(PGLZ_DecompState *dstate); + /* ---------- + * Function to get compressed size. + * Internal use only. + * ---------- + */ + extern int pglz_fetch_size(PGLZ_Header *source); + #endif /* _PG_LZCOMPRESS_H_ */ diff -Nacr src/include/utils/builtins.h.orig src/include/utils/builtins.h *** src/include/utils/builtins.h.orig Fri Jun 17 15:25:01 2005 --- src/include/utils/builtins.h Fri Jun 17 15:27:30 2005 *************** *** 828,831 **** --- 828,834 ---- /* catalog/pg_conversion.c */ extern Datum pg_convert_using(PG_FUNCTION_ARGS); + /* toastfuncs.c */ + Datum toast_compressed_length(PG_FUNCTION_ARGS); + #endif /* BUILTINS_H */ diff -Nacr src/backend/access/heap/tuptoaster.c.orig src/backend/access/heap/tuptoaster.c *** src/backend/access/heap/tuptoaster.c.orig Thu Jun 16 20:56:59 2005 --- src/backend/access/heap/tuptoaster.c Fri Jun 17 15:12:30 2005 *************** *** 1436,1438 **** --- 1436,1499 ---- return result; } + + /* ---------- + * toast_compressed_datum_size + * + * Show the compressed size of a datum + * ---------- + */ + Size + toast_compressed_datum_size(Datum value) + { + + + Size size; + varattrib *attr = (varattrib *) DatumGetPointer(value); + + if (!PointerIsValid(attr)) + { + /* + * No storage or NULL. + */ + size = 0; + } + else if (VARATT_IS_EXTERNAL(attr)) + { + /* + * Attribute is stored externally + * If it is compressed too, then we need to get the external datum + * and interrogate *its* compressed size + * otherwise just use the external rawsize (i.e. no compression) + */ + if (VARATT_IS_COMPRESSED(attr)) + { + varattrib *attrext = toast_fetch_datum(attr); + size = pglz_fetch_size((PGLZ_Header *)attrext); + pfree(attrext); + } + else + { + + size = attr->va_content.va_external.va_rawsize; + } + } + else if (VARATT_IS_COMPRESSED(attr)) + { + /* + * Attribute is stored compressed inline, so calculate + * compressed size on the datum itself. + */ + size = pglz_fetch_size((PGLZ_Header *)attr); + } + else + { + /* + * Attribute is stored inline, no compression. + */ + size = VARSIZE(attr); + } + + return size; + + } diff -Nacr src/backend/utils/adt/Makefile.orig src/backend/utils/adt/Makefile *** src/backend/utils/adt/Makefile.orig Fri Jun 17 15:26:44 2005 --- src/backend/utils/adt/Makefile Fri Jun 17 16:39:04 2005 *************** *** 24,30 **** tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o like.o: like.c like_match.c --- 24,30 ---- tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o toastfuncs.o like.o: like.c like_match.c diff -Nacr src/backend/utils/adt/pg_lzcompress.c.orig src/backend/utils/adt/pg_lzcompress.c *** src/backend/utils/adt/pg_lzcompress.c.orig Thu Jun 16 21:14:42 2005 --- src/backend/utils/adt/pg_lzcompress.c Fri Jun 17 16:30:49 2005 *************** *** 904,907 **** --- 899,930 ---- return EOF; return (int) (*(dstate->cp_in++)); + } + + /* ---------- + * pglz_fetch_size - + * + * Actual calculation to get the compressed size. + * + * ---------- + */ + int + pglz_fetch_size(PGLZ_Header *source) + { + + int size; + + + if (VARATT_SIZE(source) == source->rawsize + sizeof(PGLZ_Header)) + { + /* Compression was not attempted or not effective.*/ + size = source->rawsize; + } + else + { + /* Compressed attribute. */ + size = VARATT_SIZE(source) - sizeof(PGLZ_Header); + } + + return size; } diff -Nacr src/backend/utils/adt/toastfuncs.c.orig src/backend/utils/adt/toastfuncs.c *** src/backend/utils/adt/toastfuncs.c.orig Fri Jun 17 17:29:34 2005 --- src/backend/utils/adt/toastfuncs.c Fri Jun 17 16:39:18 2005 *************** *** 0 **** --- 1,36 ---- + /*------------------------------------------------------------------------- + * + * toastfuncs.c + * Functions for accessing information about toasted data. + * + * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + + #include "postgres.h" + #include "fmgr.h" + + #include "access/xact.h" + #include "access/tuptoaster.h" + #include "utils/builtins.h" + #include "utils/pg_lzcompress.h" + + Datum + toast_compressed_length(PG_FUNCTION_ARGS) + { + + Datum value = PG_GETARG_DATUM(0); + int size; + + + size = toast_compressed_datum_size(value); + + PG_RETURN_INT32(size); + } +
Mark Kirkwood <markir@paradise.net.nz> writes: > I thought I would have a look at: > (Datatypes) Add function to return compressed length of TOAST data values. My recollection of that discussion is that we just wanted something that would return the actual VARSIZE() of the datum. You're building something way too confusing ... A more interesting point is that the way you've declared the function, it will only work on text values, which is pretty limiting. Ideally we'd define this thing as "pg_datum_length(any-varlena-type) returns int" but there is no pseudotype corresponding to "any varlena type". I was thinking about this the other day in connection with my proposal to make something that could return the TOAST value OID of an out-of-line datum. I think the only non-restrictive way to do it would be to declare the function as taking "any", and then add a runtime check using the get_fn_expr_argtype() mechanism to test that what you've been given is in fact a varlena datatype. regards, tom lane
Tom Lane wrote: > Mark Kirkwood <markir@paradise.net.nz> writes: > >>I thought I would have a look at: >>(Datatypes) Add function to return compressed length of TOAST data values. > > > My recollection of that discussion is that we just wanted something > that would return the actual VARSIZE() of the datum. You're building > something way too confusing ... > I was guessing a little about exactly what was wanted - for some reason I couldn't access the mail archives for the last few days (however, can now). > A more interesting point is that the way you've declared the function, > it will only work on text values, which is pretty limiting. Ideally > we'd define this thing as "pg_datum_length(any-varlena-type) returns int" > but there is no pseudotype corresponding to "any varlena type". > > I was thinking about this the other day in connection with my proposal > to make something that could return the TOAST value OID of an > out-of-line datum. I think the only non-restrictive way to do it would > be to declare the function as taking "any", and then add a runtime check > using the get_fn_expr_argtype() mechanism to test that what you've been > given is in fact a varlena datatype. > Yes - was thinking I needed to check if the Datum was a varlena (and didn't know how to do it...), so thanks for the pointer! With these thoughts in mind, I'll have another go :-) Cheers Mark
The next iteration - Hopefully I have got the idea basically right. I wonder if I have done the "am I a varlena" the long way.., pls advise if so! Cheers Mark Tom Lane wrote: > > My recollection of that discussion is that we just wanted something > that would return the actual VARSIZE() of the datum. You're building > something way too confusing ... > > A more interesting point is that the way you've declared the function, > it will only work on text values, which is pretty limiting. Ideally > we'd define this thing as "pg_datum_length(any-varlena-type) returns int" > but there is no pseudotype corresponding to "any varlena type". > > I was thinking about this the other day in connection with my proposal > to make something that could return the TOAST value OID of an > out-of-line datum. I think the only non-restrictive way to do it would > be to declare the function as taking "any", and then add a runtime check > using the get_fn_expr_argtype() mechanism to test that what you've been > given is in fact a varlena datatype. > > diff -Nacr ./src/backend/access/heap/tuptoaster.c.orig ./src/backend/access/heap/tuptoaster.c *** ./src/backend/access/heap/tuptoaster.c.orig Mon Mar 21 13:23:58 2005 --- ./src/backend/access/heap/tuptoaster.c Sun Jun 19 14:24:43 2005 *************** *** 1436,1438 **** --- 1436,1482 ---- return result; } + + /* ---------- + * toast_datum_size + * + * Show the (possibly compressed) size of a datum + * ---------- + */ + Size + toast_datum_size(Datum value) + { + + varattrib *attr = (varattrib *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL(attr)) + { + /* + * Attribute is stored externally - If it is compressed too, + * then we need to get the external datum and calculate its size, + * otherwise we just use the external rawsize. + */ + if (VARATT_IS_COMPRESSED(attr)) + { + varattrib *attrext = toast_fetch_datum(attr); + result = VARSIZE(attrext); + pfree(attrext); + } + else + { + result = attr->va_content.va_external.va_rawsize; + } + } + else + { + /* + * Attribute is stored inline either compressed or not, just + * calculate the size of the datum in either case. + */ + result = VARSIZE(attr); + } + + return result; + + } diff -Nacr ./src/backend/utils/adt/Makefile.orig ./src/backend/utils/adt/Makefile *** ./src/backend/utils/adt/Makefile.orig Fri Apr 2 09:28:45 2004 --- ./src/backend/utils/adt/Makefile Fri Jun 17 17:52:09 2005 *************** *** 24,30 **** tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o like.o: like.c like_match.c --- 24,30 ---- tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o toastfuncs.o like.o: like.c like_match.c diff -Nacr ./src/backend/utils/adt/toastfuncs.c.orig ./src/backend/utils/adt/toastfuncs.c *** ./src/backend/utils/adt/toastfuncs.c.orig Fri Jun 17 17:52:09 2005 --- ./src/backend/utils/adt/toastfuncs.c Sun Jun 19 17:35:26 2005 *************** *** 0 **** --- 1,74 ---- + /*------------------------------------------------------------------------- + * + * toastfuncs.c + * Functions for accessing information about toasted data. + * + * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + + #include "postgres.h" + #include "fmgr.h" + + #include "catalog/pg_type.h" + #include "access/xact.h" + #include "access/tuptoaster.h" + #include "utils/builtins.h" + #include "utils/syscache.h" + + Datum + pg_datum_length(PG_FUNCTION_ARGS) + { + + Datum value = PG_GETARG_DATUM(0); + int size; + + + if (fcinfo->flinfo->fn_extra == NULL) + { + /* + * On the first call check lookup the datatype of the supplied argument + * and check if is a varlena . + */ + Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); + HeapTuple tp; + int typlen; + + fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + sizeof(int)); + + tp = SearchSysCache(TYPEOID, + ObjectIdGetDatum(argtypeid), + 0, 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp); + typlen = typtup->typlen; + ReleaseSysCache(tp); + } + else + { + /* Oid not in pg_type, should never happen. */ + elog(ERROR, "invalid typid %u", argtypeid); + } + + + + if ( typlen != -1 ) + { + elog(ERROR, "this function is only applicable to varlena types"); + } + } + + + size = toast_datum_size(value) - VARHDRSZ; + + PG_RETURN_INT32(size); + } + diff -Nacr ./src/include/access/tuptoaster.h.orig ./src/include/access/tuptoaster.h *** ./src/include/access/tuptoaster.h.orig Mon Mar 21 13:24:04 2005 --- ./src/include/access/tuptoaster.h Sun Jun 19 14:45:18 2005 *************** *** 138,141 **** --- 138,149 ---- */ extern Size toast_raw_datum_size(Datum value); + /* ---------- + * toast_datum_size - + * + * Return the (possibly compressed) size of a varlena datum + * ---------- + */ + extern Size toast_datum_size(Datum value); + #endif /* TUPTOASTER_H */ diff -Nacr ./src/include/catalog/pg_proc.h.orig ./src/include/catalog/pg_proc.h *** ./src/include/catalog/pg_proc.h.orig Wed Jun 15 09:04:41 2005 --- ./src/include/catalog/pg_proc.h Sun Jun 19 17:08:35 2005 *************** *** 3655,3660 **** --- 3655,3664 ---- DATA(insert OID = 2560 ( pg_postmaster_start_time PGNSP PGUID 12 f f t f s 0 1184 "" _null_ _null_ _null_ pgsql_postmaster_start_time- _null_ )); DESCR("postmaster start time"); + /* Toast compressed length */ + DATA(insert OID = 2561 ( pg_datum_length PGNSP PGUID 12 f f t f i 1 23 "2276" _null_ _null_ _null_ pg_datum_length- _null_ )); + DESCR("length (possibly compressed) of varlena types"); + /* * Symbolic values for provolatile column: these indicate whether the result diff -Nacr ./src/include/utils/builtins.h.orig ./src/include/utils/builtins.h *** ./src/include/utils/builtins.h.orig Fri May 27 12:57:49 2005 --- ./src/include/utils/builtins.h Sun Jun 19 12:48:35 2005 *************** *** 828,831 **** --- 828,834 ---- /* catalog/pg_conversion.c */ extern Datum pg_convert_using(PG_FUNCTION_ARGS); + /* toastfuncs.c */ + extern Datum pg_datum_length(PG_FUNCTION_ARGS); + #endif /* BUILTINS_H */
I did a few cleanups on the last patch. Please examine this one instead. The changes are: 1. Add documentation for pg_datum_length builtin. 2. Correct some typos in the code comments. 3. Move the code in toastfuncs.c to varlena.c as it is probably the correct place. 4. Use ereport instead of elog. 5 Quiet compiler warning in pg_datum_length. Best wishes Mark Mark Kirkwood wrote: > The next iteration - > > Hopefully I have got the idea basically right. > > I wonder if I have done the "am I a varlena" the long way.., pls advise > if so! > diff -Nacr ./doc/src/sgml/func.sgml.orig ./doc/src/sgml/func.sgml *** ./doc/src/sgml/func.sgml.orig Mon Jun 20 15:38:23 2005 --- ./doc/src/sgml/func.sgml Mon Jun 20 15:45:51 2005 *************** *** 2187,2192 **** --- 2187,2200 ---- </row> <row> + <entry><literal><function>pg_datum_length</function>(<parameter>string</parameter>)</literal></entry> + <entry><type>integer</type></entry> + <entry>Number of bytes (before toast decompression) in string</entry> + <entry><literal>pg_datum_length( 'jo\\000se'::bytea)</literal></entry> + <entry><literal>5</literal></entry> + </row> + + <row> <entry><literal><function>position</function>(<parameter>substring</parameter> in <parameter>string</parameter>)</literal></entry> <entry><type>integer</type></entry> <entry>Location of specified substring</entry> diff -Nacr ./src/backend/access/heap/tuptoaster.c.orig ./src/backend/access/heap/tuptoaster.c *** ./src/backend/access/heap/tuptoaster.c.orig Mon Jun 20 17:11:37 2005 --- ./src/backend/access/heap/tuptoaster.c Mon Jun 20 17:11:44 2005 *************** *** 1436,1438 **** --- 1436,1482 ---- return result; } + + /* ---------- + * toast_datum_size + * + * Show the (possibly compressed) size of a datum + * ---------- + */ + Size + toast_datum_size(Datum value) + { + + varattrib *attr = (varattrib *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL(attr)) + { + /* + * Attribute is stored externally - If it is compressed too, + * then we need to get the external datum and calculate its size, + * otherwise we just use the external rawsize. + */ + if (VARATT_IS_COMPRESSED(attr)) + { + varattrib *attrext = toast_fetch_datum(attr); + result = VARSIZE(attrext); + pfree(attrext); + } + else + { + result = attr->va_content.va_external.va_rawsize; + } + } + else + { + /* + * Attribute is stored inline either compressed or not, just + * calculate the size of the datum in either case. + */ + result = VARSIZE(attr); + } + + return result; + + } diff -Nacr ./src/backend/utils/adt/varlena.c.orig ./src/backend/utils/adt/varlena.c *** ./src/backend/utils/adt/varlena.c.orig Mon Jun 20 14:28:03 2005 --- ./src/backend/utils/adt/varlena.c Mon Jun 20 17:17:58 2005 *************** *** 28,33 **** --- 28,34 ---- #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" + #include "utils/syscache.h" typedef struct varlena unknown; *************** *** 2330,2333 **** --- 2331,2396 ---- result_text = PG_STR_GET_TEXT(hexsum); PG_RETURN_TEXT_P(result_text); + } + + /* + * Show the (possibly compressed) length of a datum. + */ + Datum + pg_datum_length(PG_FUNCTION_ARGS) + { + + Datum value = PG_GETARG_DATUM(0); + int result; + + + if (fcinfo->flinfo->fn_extra == NULL) + { + /* + * On the first call lookup the datatype of the supplied argument + * and check if is a varlena. + */ + Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); + HeapTuple tp; + int typlen = 0; + + + tp = SearchSysCache(TYPEOID, + ObjectIdGetDatum(argtypeid), + 0, 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp); + typlen = typtup->typlen; + ReleaseSysCache(tp); + } + else + { + /* Oid not in pg_type, should never happen. */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("invalid typid: %u", argtypeid))); + } + + + if ( typlen != -1 ) + { + /* Not a varlena. */ + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function is only applicable to varlena types"))); + } + + + /* + * Allocate fn_extra so we don't go here again! + */ + fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + sizeof(int)); + } + + + result = toast_datum_size(value) - VARHDRSZ; + + PG_RETURN_INT32(result); } diff -Nacr ./src/include/access/tuptoaster.h.orig ./src/include/access/tuptoaster.h *** ./src/include/access/tuptoaster.h.orig Mon Mar 21 13:24:04 2005 --- ./src/include/access/tuptoaster.h Sun Jun 19 14:45:18 2005 *************** *** 138,141 **** --- 138,149 ---- */ extern Size toast_raw_datum_size(Datum value); + /* ---------- + * toast_datum_size - + * + * Return the (possibly compressed) size of a varlena datum + * ---------- + */ + extern Size toast_datum_size(Datum value); + #endif /* TUPTOASTER_H */ diff -Nacr ./src/include/catalog/pg_proc.h.orig ./src/include/catalog/pg_proc.h *** ./src/include/catalog/pg_proc.h.orig Wed Jun 15 09:04:41 2005 --- ./src/include/catalog/pg_proc.h Sun Jun 19 17:08:35 2005 *************** *** 3655,3660 **** --- 3655,3664 ---- DATA(insert OID = 2560 ( pg_postmaster_start_time PGNSP PGUID 12 f f t f s 0 1184 "" _null_ _null_ _null_ pgsql_postmaster_start_time- _null_ )); DESCR("postmaster start time"); + /* Toast compressed length */ + DATA(insert OID = 2561 ( pg_datum_length PGNSP PGUID 12 f f t f i 1 23 "2276" _null_ _null_ _null_ pg_datum_length- _null_ )); + DESCR("length (possibly compressed) of varlena types"); + /* * Symbolic values for provolatile column: these indicate whether the result diff -Nacr ./src/include/utils/builtins.h.orig ./src/include/utils/builtins.h *** ./src/include/utils/builtins.h.orig Fri May 27 12:57:49 2005 --- ./src/include/utils/builtins.h Mon Jun 20 15:59:09 2005 *************** *** 601,606 **** --- 601,607 ---- extern Datum byteapos(PG_FUNCTION_ARGS); extern Datum bytea_substr(PG_FUNCTION_ARGS); extern Datum bytea_substr_no_len(PG_FUNCTION_ARGS); + extern Datum pg_datum_length(PG_FUNCTION_ARGS); /* version.c */ extern Datum pgsql_version(PG_FUNCTION_ARGS);
Mark Kirkwood wrote: > I did a few cleanups on the last patch. Please examine this one instead. > The changes are: > > 1. Add documentation for pg_datum_length builtin. > 2. Correct some typos in the code comments. > 3. Move the code in toastfuncs.c to varlena.c as it is probably the > correct place. > 4. Use ereport instead of elog. > 5 Quiet compiler warning in pg_datum_length. I have modified your patch to simplify the logic, and renamed it to pg_column_size(), to be consistent with our soon-to-be-added pg_relation/tablespace/database functions from dbsize. Here is a sample usage: test=> CREATE TABLE test (x INT, y TEXT); CREATE TABLE test=> INSERT INTO test VALUES (4, repeat('x', 10000)); INSERT 0 1 test=> INSERT INTO test VALUES (4, repeat('x', 100000)); INSERT 0 1 test=> SELECT pg_column_size(x), pg_column_size(y) FROM test; pg_column_size | pg_column_size ----------------+---------------- 4 | 121 4 | 1152 (2 rows) Interesting the 10-times larger column is 10-times larger in storage. Do we have some limit on how many repeated values we can record? Patch attached and applied. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073 Index: doc/src/sgml/func.sgml =================================================================== RCS file: /cvsroot/pgsql/doc/src/sgml/func.sgml,v retrieving revision 1.262 diff -c -c -r1.262 func.sgml *** doc/src/sgml/func.sgml 29 Jun 2005 01:52:56 -0000 1.262 --- doc/src/sgml/func.sgml 6 Jul 2005 18:55:34 -0000 *************** *** 2187,2192 **** --- 2187,2200 ---- </row> <row> + <entry><literal><function>pg_column_size</function>(<parameter>string</parameter>)</literal></entry> + <entry><type>integer</type></entry> + <entry>Number of bytes required to store the value, which might be compressed</entry> + <entry><literal>pg_column_size('jo\\000se'::bytea)</literal></entry> + <entry><literal>5</literal></entry> + </row> + + <row> <entry><literal><function>position</function>(<parameter>substring</parameter> in <parameter>string</parameter>)</literal></entry> <entry><type>integer</type></entry> <entry>Location of specified substring</entry> Index: src/backend/access/heap/tuptoaster.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/access/heap/tuptoaster.c,v retrieving revision 1.49 diff -c -c -r1.49 tuptoaster.c *** src/backend/access/heap/tuptoaster.c 21 Mar 2005 01:23:58 -0000 1.49 --- src/backend/access/heap/tuptoaster.c 6 Jul 2005 18:55:35 -0000 *************** *** 1436,1438 **** --- 1436,1480 ---- return result; } + + /* ---------- + * toast_datum_size + * + * Show the (possibly compressed) size of a datum + * ---------- + */ + Size + toast_datum_size(Datum value) + { + + varattrib *attr = (varattrib *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL(attr)) + { + /* + * Attribute is stored externally - If it is compressed too, + * then we need to get the external datum and calculate its size, + * otherwise we just use the external rawsize. + */ + if (VARATT_IS_COMPRESSED(attr)) + { + varattrib *attrext = toast_fetch_datum(attr); + result = VARSIZE(attrext); + pfree(attrext); + } + else + result = attr->va_content.va_external.va_rawsize; + } + else + { + /* + * Attribute is stored inline either compressed or not, just + * calculate the size of the datum in either case. + */ + result = VARSIZE(attr); + } + + return result; + + } Index: src/backend/utils/adt/varlena.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/varlena.c,v retrieving revision 1.124 diff -c -c -r1.124 varlena.c *** src/backend/utils/adt/varlena.c 4 Jul 2005 18:56:44 -0000 1.124 --- src/backend/utils/adt/varlena.c 6 Jul 2005 18:55:36 -0000 *************** *** 28,33 **** --- 28,34 ---- #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" + #include "utils/syscache.h" typedef struct varlena unknown; *************** *** 2348,2350 **** --- 2349,2395 ---- result_text = PG_STR_GET_TEXT(hexsum); PG_RETURN_TEXT_P(result_text); } + + /* + * Return the length of a datum, possibly compressed + */ + Datum + pg_column_size(PG_FUNCTION_ARGS) + { + Datum value = PG_GETARG_DATUM(0); + int result; + + /* fn_extra stores the fixed column length, or -1 for varlena. */ + if (fcinfo->flinfo->fn_extra == NULL) /* first call? */ + { + /* On the first call lookup the datatype of the supplied argument */ + Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); + HeapTuple tp; + int typlen; + + tp = SearchSysCache(TYPEOID, + ObjectIdGetDatum(argtypeid), + 0, 0, 0); + if (!HeapTupleIsValid(tp)) + { + /* Oid not in pg_type, should never happen. */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("invalid typid: %u", argtypeid))); + } + + typlen = ((Form_pg_type)GETSTRUCT(tp))->typlen; + ReleaseSysCache(tp); + fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + sizeof(int)); + *(int *)fcinfo->flinfo->fn_extra = typlen; + } + + if (*(int *)fcinfo->flinfo->fn_extra != -1) + PG_RETURN_INT32(*(int *)fcinfo->flinfo->fn_extra); + else + { + result = toast_datum_size(value) - VARHDRSZ; + PG_RETURN_INT32(result); + } + } Index: src/include/access/tuptoaster.h =================================================================== RCS file: /cvsroot/pgsql/src/include/access/tuptoaster.h,v retrieving revision 1.22 diff -c -c -r1.22 tuptoaster.h *** src/include/access/tuptoaster.h 21 Mar 2005 01:24:04 -0000 1.22 --- src/include/access/tuptoaster.h 6 Jul 2005 18:55:37 -0000 *************** *** 138,141 **** --- 138,149 ---- */ extern Size toast_raw_datum_size(Datum value); + /* ---------- + * toast_datum_size - + * + * Return the storage size of a varlena datum + * ---------- + */ + extern Size toast_datum_size(Datum value); + #endif /* TUPTOASTER_H */ Index: src/include/catalog/pg_proc.h =================================================================== RCS file: /cvsroot/pgsql/src/include/catalog/pg_proc.h,v retrieving revision 1.373 diff -c -c -r1.373 pg_proc.h *** src/include/catalog/pg_proc.h 1 Jul 2005 19:19:03 -0000 1.373 --- src/include/catalog/pg_proc.h 6 Jul 2005 18:55:43 -0000 *************** *** 3658,3663 **** --- 3658,3667 ---- DATA(insert OID = 2560 ( pg_postmaster_start_time PGNSP PGUID 12 f f t f s 0 1184 "" _null_ _null_ _null_ pgsql_postmaster_start_time- _null_ )); DESCR("postmaster start time"); + /* Column storage size */ + DATA(insert OID = 1269 ( pg_column_size PGNSP PGUID 12 f f t f i 1 23 "2276" _null_ _null_ _null_ pg_column_size- _null_ )); + DESCR("bytes required to store the value, perhaps with compression"); + /* new functions for Y-direction rtree opclasses */ DATA(insert OID = 2562 ( box_below PGNSP PGUID 12 f f t f i 2 16 "603 603" _null_ _null_ _null_ box_below- _null_ )); DESCR("is below"); Index: src/include/utils/builtins.h =================================================================== RCS file: /cvsroot/pgsql/src/include/utils/builtins.h,v retrieving revision 1.258 diff -c -c -r1.258 builtins.h *** src/include/utils/builtins.h 17 Jun 2005 22:32:50 -0000 1.258 --- src/include/utils/builtins.h 6 Jul 2005 18:55:43 -0000 *************** *** 601,606 **** --- 601,607 ---- extern Datum byteapos(PG_FUNCTION_ARGS); extern Datum bytea_substr(PG_FUNCTION_ARGS); extern Datum bytea_substr_no_len(PG_FUNCTION_ARGS); + extern Datum pg_column_size(PG_FUNCTION_ARGS); /* version.c */ extern Datum pgsql_version(PG_FUNCTION_ARGS);
Bruce Momjian wrote: > + /* > + * Return the length of a datum, possibly compressed > + */ > + Datum > + pg_column_size(PG_FUNCTION_ARGS) > + { > + Datum value = PG_GETARG_DATUM(0); > + int result; > + > + /* fn_extra stores the fixed column length, or -1 for varlena. */ > + if (fcinfo->flinfo->fn_extra == NULL) /* first call? */ > + { > + /* On the first call lookup the datatype of the supplied argument */ [...] Is this optimization worth the code complexity? > + tp = SearchSysCache(TYPEOID, > + ObjectIdGetDatum(argtypeid), > + 0, 0, 0); > + if (!HeapTupleIsValid(tp)) > + { > + /* Oid not in pg_type, should never happen. */ > + ereport(ERROR, > + (errcode(ERRCODE_INTERNAL_ERROR), > + errmsg("invalid typid: %u", argtypeid))); > + } elog(ERROR) is usually used for "can't happen" errors; also, the usual error message in this scenario is "cache lookup failed [...]". Perhaps better to use get_typlen() here, anyway. -Neil
Neil Conway wrote: > Bruce Momjian wrote: > >> + /* + * Return the length of a datum, possibly compressed >> + */ >> + Datum >> + pg_column_size(PG_FUNCTION_ARGS) >> + { >> + Datum value = PG_GETARG_DATUM(0); >> + int result; >> + + /* fn_extra stores the fixed column length, or -1 for >> varlena. */ >> + if (fcinfo->flinfo->fn_extra == NULL) /* first call? */ >> + { >> + /* On the first call lookup the datatype of the supplied >> argument */ > > > [...] > > Is this optimization worth the code complexity? > I didn't performance test it, but the idea of hammering the catalogs for each value to be processed seemed a bad thing. >> + tp = SearchSysCache(TYPEOID, >> + ObjectIdGetDatum(argtypeid), >> + 0, 0, 0); >> + if (!HeapTupleIsValid(tp)) >> + { >> + /* Oid not in pg_type, should never happen. */ >> + ereport(ERROR, >> + (errcode(ERRCODE_INTERNAL_ERROR), >> + errmsg("invalid typid: %u", argtypeid))); >> + } > > > elog(ERROR) is usually used for "can't happen" errors; also, the usual > error message in this scenario is "cache lookup failed [...]". Perhaps > better to use get_typlen() here, anyway. > Ahem,..ah yes, get_typlen()! (ducks)- that is clearly much better! I have attached a little change to varlena.c that uses it. I left the ereport as it was, but am not fussed about it either way. BTW - Bruce, I like the changes, makes the beast more friendly to use! Best wishes Mark Index: src/backend/utils/adt/varlena.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/adt/varlena.c,v retrieving revision 1.125 diff -c -r1.125 varlena.c *** src/backend/utils/adt/varlena.c 6 Jul 2005 19:02:52 -0000 1.125 --- src/backend/utils/adt/varlena.c 7 Jul 2005 02:52:50 -0000 *************** *** 28,34 **** #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" - #include "utils/syscache.h" typedef struct varlena unknown; --- 28,33 ---- *************** *** 2364,2385 **** { /* On the first call lookup the datatype of the supplied argument */ Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); ! HeapTuple tp; ! int typlen; ! tp = SearchSysCache(TYPEOID, ! ObjectIdGetDatum(argtypeid), ! 0, 0, 0); ! if (!HeapTupleIsValid(tp)) { /* Oid not in pg_type, should never happen. */ ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), ! errmsg("invalid typid: %u", argtypeid))); } ! ! typlen = ((Form_pg_type)GETSTRUCT(tp))->typlen; ! ReleaseSysCache(tp); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(int)); *(int *)fcinfo->flinfo->fn_extra = typlen; --- 2363,2379 ---- { /* On the first call lookup the datatype of the supplied argument */ Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); ! int typlen = get_typlen(argtypeid); ! ! if (typlen == 0) { /* Oid not in pg_type, should never happen. */ ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), ! errmsg("cache lookup failed for type %u", argtypeid))); } ! fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(int)); *(int *)fcinfo->flinfo->fn_extra = typlen;
Mark Kirkwood wrote: > I didn't performance test it, but the idea of hammering the catalogs for > each value to be processed seemed a bad thing. Well, the syscache already sits in front of the catalogs themselves. I'd be curious to see what the performance difference actually is... -Neil
On Thu, Jul 07, 2005 at 03:01:46PM +1200, Mark Kirkwood wrote: > Neil Conway wrote: > >elog(ERROR) is usually used for "can't happen" errors; also, the usual > >error message in this scenario is "cache lookup failed [...]". Perhaps > >better to use get_typlen() here, anyway. > > I have attached a little change to varlena.c that uses it. I left the > ereport as it was, but am not fussed about it either way. I am, because it gives useless messages to the translators to work on. elog parameters are not marked for translation, ereport are (errmsg and friends, really). So please don't do that. -- Alvaro Herrera (<alvherre[a]alvh.no-ip.org>) "¿Cómo puedes confiar en algo que pagas y que no ves, y no confiar en algo que te dan y te lo muestran?" (Germán Poo)
Alvaro Herrera wrote: > On Thu, Jul 07, 2005 at 03:01:46PM +1200, Mark Kirkwood wrote: > >>Neil Conway wrote: > > >>>elog(ERROR) is usually used for "can't happen" errors. >> >>I have attached a little change to varlena.c that uses it. I left the >>ereport as it was, but am not fussed about it either way. > > > I am, because it gives useless messages to the translators to work on. > elog parameters are not marked for translation, ereport are (errmsg and > friends, really). So please don't do that. > Ok, didn't realize the difference! Revised patch attached that uses elog. Neil, I will runs some tests to see if there is any performance saving with the first-call-only business. Cheers Mark Index: src/backend/utils/adt/varlena.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/adt/varlena.c,v retrieving revision 1.125 diff -c -r1.125 varlena.c *** src/backend/utils/adt/varlena.c 6 Jul 2005 19:02:52 -0000 1.125 --- src/backend/utils/adt/varlena.c 7 Jul 2005 03:40:44 -0000 *************** *** 28,34 **** #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" - #include "utils/syscache.h" typedef struct varlena unknown; --- 28,33 ---- *************** *** 2364,2385 **** { /* On the first call lookup the datatype of the supplied argument */ Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); ! HeapTuple tp; ! int typlen; ! tp = SearchSysCache(TYPEOID, ! ObjectIdGetDatum(argtypeid), ! 0, 0, 0); ! if (!HeapTupleIsValid(tp)) { /* Oid not in pg_type, should never happen. */ ! ereport(ERROR, ! (errcode(ERRCODE_INTERNAL_ERROR), ! errmsg("invalid typid: %u", argtypeid))); } ! ! typlen = ((Form_pg_type)GETSTRUCT(tp))->typlen; ! ReleaseSysCache(tp); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(int)); *(int *)fcinfo->flinfo->fn_extra = typlen; --- 2363,2377 ---- { /* On the first call lookup the datatype of the supplied argument */ Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); ! int typlen = get_typlen(argtypeid); ! ! if (typlen == 0) { /* Oid not in pg_type, should never happen. */ ! elog(ERROR, "cache lookup failed for type %u", argtypeid); } ! fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(int)); *(int *)fcinfo->flinfo->fn_extra = typlen;
Alvaro Herrera <alvherre@alvh.no-ip.org> writes: > On Thu, Jul 07, 2005 at 03:01:46PM +1200, Mark Kirkwood wrote: >> I have attached a little change to varlena.c that uses it. I left the >> ereport as it was, but am not fussed about it either way. > I am, because it gives useless messages to the translators to work on. Exactly. I had already made a private note to fix that patch --- inventing your own random wording and punctuation for an extremely standard error message is simply Not Acceptable. regards, tom lane
Neil Conway wrote: > Mark Kirkwood wrote: > >> I didn't performance test it, but the idea of hammering the catalogs for >> each value to be processed seemed a bad thing. > > > Well, the syscache already sits in front of the catalogs themselves. I'd > be curious to see what the performance difference actually is... > > I did some tests with a two tables, one small and one large, I am seeing a consistent difference favoring the first-call-only type checking: Table "public.dim1" Column | Type | Modifiers --------+------------------------+----------- d1key | integer | not null dat | character varying(100) | not null dattyp | character varying(20) | not null dfill | character varying(100) | not null INFO: "dim1": scanned 24 of 24 pages, containing 1001 live rows and 0 dead rows; 1001 rows in sample, 1001 estimated total rows SELECT max(pg_column_size(dfill)) FROM dim1 Run 1st call only check? elapsed(ms) 1 y 5.5 2 y 3.1 3 n 11.3 4 n 4.1 Now try a bigger table: Table "public.fact0" Column | Type | Modifiers --------+------------------------+----------- d0key | integer | not null d1key | integer | not null d2key | integer | not null fval | integer | not null ffill | character varying(100) | not null INFO: "fact0": scanned 3000 of 18182 pages, containing 165000 live rows and 0 dead rows; 3000 rows in sample, 1000010 estimated total rows SELECT max(pg_column_size(ffill)) FROM fact0 Run 1st call only check? elapsed(ms) 1 y 2497 2 y 2484 3 y 2496 4 y 2480 5 n 3572 6 n 3570 7 n 3558 8 n 3457
Mark Kirkwood <markir@paradise.net.nz> writes: > I did some tests with a two tables, one small and one large, I am seeing > a consistent difference favoring the first-call-only type checking: We had come to a similar conclusion in regards to making array-related functions cache lookup info across calls. It's worth checking the assumption every so often, but the result surprises me not at all. regards, tom lane
Tom Lane wrote: > Alvaro Herrera <alvherre@alvh.no-ip.org> writes: > >>On Thu, Jul 07, 2005 at 03:01:46PM +1200, Mark Kirkwood wrote: >> >>>I have attached a little change to varlena.c that uses it. I left the >>>ereport as it was, but am not fussed about it either way. > > >>I am, because it gives useless messages to the translators to work on. > > > Exactly. I had already made a private note to fix that patch --- > inventing your own random wording and punctuation for an extremely > standard error message is simply Not Acceptable. > Apologies all, my "not fussed about it" was highly misleading - I had already changed the error message as per Neil's suggestion in the revised patch, but that was not obvious from my comment :-( regards Mark
Patch applied. Thanks. --------------------------------------------------------------------------- Mark Kirkwood wrote: > Alvaro Herrera wrote: > > On Thu, Jul 07, 2005 at 03:01:46PM +1200, Mark Kirkwood wrote: > > > >>Neil Conway wrote: > > > > > >>>elog(ERROR) is usually used for "can't happen" errors. > >> > >>I have attached a little change to varlena.c that uses it. I left the > >>ereport as it was, but am not fussed about it either way. > > > > > > I am, because it gives useless messages to the translators to work on. > > elog parameters are not marked for translation, ereport are (errmsg and > > friends, really). So please don't do that. > > > > Ok, didn't realize the difference! Revised patch attached that uses elog. > > Neil, I will runs some tests to see if there is any performance saving > with the first-call-only business. > > Cheers > > Mark > > Index: src/backend/utils/adt/varlena.c > =================================================================== > RCS file: /projects/cvsroot/pgsql/src/backend/utils/adt/varlena.c,v > retrieving revision 1.125 > diff -c -r1.125 varlena.c > *** src/backend/utils/adt/varlena.c 6 Jul 2005 19:02:52 -0000 1.125 > --- src/backend/utils/adt/varlena.c 7 Jul 2005 03:40:44 -0000 > *************** > *** 28,34 **** > #include "utils/builtins.h" > #include "utils/lsyscache.h" > #include "utils/pg_locale.h" > - #include "utils/syscache.h" > > > typedef struct varlena unknown; > --- 28,33 ---- > *************** > *** 2364,2385 **** > { > /* On the first call lookup the datatype of the supplied argument */ > Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); > ! HeapTuple tp; > ! int typlen; > > ! tp = SearchSysCache(TYPEOID, > ! ObjectIdGetDatum(argtypeid), > ! 0, 0, 0); > ! if (!HeapTupleIsValid(tp)) > { > /* Oid not in pg_type, should never happen. */ > ! ereport(ERROR, > ! (errcode(ERRCODE_INTERNAL_ERROR), > ! errmsg("invalid typid: %u", argtypeid))); > } > ! > ! typlen = ((Form_pg_type)GETSTRUCT(tp))->typlen; > ! ReleaseSysCache(tp); > fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, > sizeof(int)); > *(int *)fcinfo->flinfo->fn_extra = typlen; > --- 2363,2377 ---- > { > /* On the first call lookup the datatype of the supplied argument */ > Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); > ! int typlen = get_typlen(argtypeid); > > ! > ! if (typlen == 0) > { > /* Oid not in pg_type, should never happen. */ > ! elog(ERROR, "cache lookup failed for type %u", argtypeid); > } > ! > fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, > sizeof(int)); > *(int *)fcinfo->flinfo->fn_extra = typlen; > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073