For the record - I have also devised another solution to the
underlying problem (md5(bytea) cannot process large amounts
of input), namely a chunked md5():
create or replace function lo_chunked_md5(oid, int)
    returns text
    language plpgsql
    stable strict
as $$
DECLARE
    lo_id alias for $1;
    chunk_size alias for $2;
    _lo_fd integer;
    _lo_size integer;
    _chunk_count integer;
    _remainder integer;
    _md5_concat text;
    INV_READ constant integer := x'40000'::integer;
    SEEK_SET constant integer := 0;
    SEEK_END constant integer := 2;
BEGIN
    -- open the large object (this also checks that lo_id exists)
    _lo_fd := lo_open(lo_id, INV_READ);
    -- determine its size by seeking to the end
    _lo_size := lo_lseek(_lo_fd, 0, SEEK_END);
    PERFORM lo_close(_lo_fd);
    -- number of full chunks and size of the trailing partial chunk
    _chunk_count := _lo_size / chunk_size;
    _remainder := _lo_size % chunk_size;
    -- md5 each full chunk and concatenate the hex digests
    _md5_concat := '';
    FOR _chunk_id IN 1 .. _chunk_count LOOP
        _md5_concat := _md5_concat || md5(lo_get(lo_id, (_chunk_id - 1) * chunk_size, chunk_size));
    END LOOP;
    -- add the remainder, if any (skipping an empty remainder keeps
    -- the result consistent with the client-side function below)
    IF _remainder > 0 THEN
        _md5_concat := _md5_concat || md5(lo_get(lo_id, _chunk_count * chunk_size, _remainder));
    END IF;
    -- the final result is the md5 over the concatenated chunk digests
    RETURN md5(_md5_concat);
END;
$$;
This can easily be mirrored by a client-side function in Python, say:
import hashlib
import logging

_log = logging.getLogger(__name__)
_MB = 1024 * 1024

def file2chunked_md5(filename=None, chunk_size=500*_MB):
    _log.debug('chunked_md5(%s, %s)', filename, chunk_size)
    md5_concat = u''
    f = open(filename, 'rb')
    while True:
        data = f.read(chunk_size)
        if not data:
            break
        # md5 this chunk and append the hex digest
        md5 = hashlib.md5()
        md5.update(data)
        md5_concat += md5.hexdigest()
    f.close()
    # the final result is the md5 over the concatenated chunk digests
    md5 = hashlib.md5()
    md5.update(md5_concat)
    hex_digest = md5.hexdigest()
    _log.debug('md5(%s): %s', md5_concat, hex_digest)
    return hex_digest
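For completeness, a round-trip check could look something like
the sketch below (psycopg2, the connection DSN, and the large
object OID are assumptions for illustration only; the important
point is that both sides must use the same chunk_size, since the
result is an md5 over the concatenated per-chunk digests rather
than over the raw data):

# illustrative sketch only - reuses _MB and file2chunked_md5 from above;
# the DSN and the large object OID are made-up placeholders
import psycopg2

def verify_lo_md5(dsn, lo_oid, filename, chunk_size=500*_MB):
    conn = psycopg2.connect(dsn)
    try:
        curs = conn.cursor()
        # server-side digest via the plpgsql function defined above
        curs.execute('SELECT lo_chunked_md5(%s, %s)', (lo_oid, chunk_size))
        server_md5 = curs.fetchone()[0]
    finally:
        conn.close()
    # client-side digest over a local copy of the same data
    return server_md5 == file2chunked_md5(filename=filename, chunk_size=chunk_size)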
Still, I'd welcome a native, streaming md5(loid), which is
bound to be better optimized by design.
Karsten
--
GPG key ID E4071346 @ eu.pool.sks-keyservers.net
E167 67FD A291 2BEA 73BD 4537 78B9 A9F9 E407 1346