I'm trying to figure out how to compile text search statistics on a per-document basis.
While I successfully compute text search statistics for the entire corpus with a call
to ts_stat after having inserted all documents, what I also want is to run ts_stat on
the tsvector for each row so as to get the term frequency per document.
Sample code and comments follow.
-- Dumped from database version 9.5.7
CREATE DATABASE nlp;
\connect nlp
CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;
SET search_path = public, pg_catalog;
-- This table stores one document per row
CREATE TABLE document (
document_id serial primary key,
content text NOT NULL,
document_vector tsvector
);
-- This is the table I need help with how to to populate
-- with term frequency per document
CREATE TABLE document_statistics (
document_id integer primary key,
word text,
ndoc bigint, /* this will be one, since there is only one document */
nentry bigint /* this is the number of interest */
);
ALTER TABLE ONLY document_statistics
ADD CONSTRAINT document_statistics_document_id_fkey
FOREIGN KEY (document_id)
REFERENCES document(document_id);
CREATE FUNCTION document_bit() RETURNS trigger
LANGUAGE plpgsql
AS $$
BEGIN
-- Compile document statistics for each document upon insert
SELECT to_tsvector('simple', new.content) INTO new.document_vector;
RETURN new;
END;
$$;
CREATE TRIGGER document_bit
BEFORE INSERT OR UPDATE ON document
FOR EACH ROW EXECUTE PROCEDURE document_bit();
-- Sample data
INSERT INTO document (content) VALUES ('Hello World!');
INSERT INTO document (content) VALUES ('The quick brown dog jumped over the lazy dog');
INSERT INTO document (content) VALUES ('One flew over the coo coo''s nest',);
-- Once all the individual documents are inserted, then
-- calculate overall corpus statistics
insert into corpus_statistics select * from ts_stat('select document_vector from document');
-- I'm thinking something like this proposed after insert trigger
-- is where I want to compute document statistics, but can't
-- figure out how to make it work
CREATE FUNCTION document_ait() RETURNS trigger
LANGUAGE plpgsql
AS $$
DECLARE
word_stat record;
BEGIN
/*
--Here's one pathetic try
FOR word_stat IN
select * from ts_stat('select * from ' || (new.document_vector))
LOOP
RAISE NOTICE '%' , word_stat;
INSERT INTO public.document_statistics(
document_id, word, ndoc, nentry)
VALUES (new.document_id, word_stat.word, word_stat.ndoc, word_stat.nentry);
END LOOP;
*/
RETURN new;
END;
$$;
CREATE TRIGGER document_ait AFTER INSERT ON document FOR EACH ROW EXECUTE PROCEDURE document_ait();