Thread: Per-document document statistics

Per-document document statistics

From
Berend Tober
Date:
I'm trying to figure out how to compile text search statistics on a per-document basis.
I can already compute text search statistics for the entire corpus by calling
ts_stat after all documents have been inserted. What I also want is to run ts_stat on
the tsvector of each row, so as to get the term frequency per document.

Sample code and comments follow.


-- Dumped from database version 9.5.7

-- Create a dedicated database for this experiment and switch to it.
-- \connect is a psql meta-command, so the script is meant to be fed to psql.
CREATE DATABASE nlp;

\connect nlp

-- PL/pgSQL is the language the trigger functions in this script are written in.
CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;

-- Unqualified object names resolve to schema public first, then pg_catalog.
SET search_path TO public, pg_catalog;

-- This table stores one document per row

-- One row per document.
--   document_id     : surrogate key, assigned from a sequence
--   content         : raw text of the document
--   document_vector : tsvector form of content; nullable in the table
--                     definition itself
CREATE TABLE document (
    document_id     serial,
    content         text NOT NULL,
    document_vector tsvector,
    CONSTRAINT document_pkey PRIMARY KEY (document_id)
);

-- This is the table I need help with: how to populate it
-- with term frequency per document

-- Term frequency per document: one row per (document, word), filled by the
-- document_ait trigger defined further down.
CREATE TABLE document_statistics (
     document_id integer NOT NULL,
     word text NOT NULL,
     ndoc bigint, /* this will be one, since ts_stat is run over a single document */
     nentry bigint, /* occurrences of word in the document: the number of interest */
     -- The key has to include word: a key on document_id alone would allow
     -- only a single word to be stored per document.
     CONSTRAINT document_statistics_pkey PRIMARY KEY (document_id, word)
);


ALTER TABLE ONLY document_statistics
     ADD CONSTRAINT document_statistics_document_id_fkey
     FOREIGN KEY (document_id)
     REFERENCES document(document_id);


-- Before-insert/update row trigger: derive document_vector from content
-- using the 'simple' text search configuration.
CREATE FUNCTION document_bit() RETURNS trigger
     LANGUAGE plpgsql
     AS $$
BEGIN
    new.document_vector := to_tsvector('simple', new.content);
    RETURN new;
END;
$$;

CREATE TRIGGER document_bit
     BEFORE INSERT OR UPDATE ON document
     FOR EACH ROW EXECUTE PROCEDURE document_bit();


-- After-insert row trigger: compile the per-document statistics.
--
-- ts_stat() takes the text of a query that returns a single tsvector column.
-- The new row's vector is therefore embedded in that query as a quoted
-- literal (format's %L does the quoting) and cast back to tsvector, so
-- ts_stat sees exactly one document: the row that was just inserted.
--
-- This fires on INSERT only. An UPDATE of content refreshes document_vector
-- through document_bit, but does not refresh these statistics.
CREATE FUNCTION document_ait() RETURNS trigger
     LANGUAGE plpgsql
     AS $$
BEGIN
    INSERT INTO public.document_statistics (document_id, word, ndoc, nentry)
    SELECT new.document_id, stat.word, stat.ndoc, stat.nentry
    FROM ts_stat(format('SELECT %L::tsvector', new.document_vector)) AS stat;

    -- The return value of an AFTER row trigger is ignored.
    RETURN new;
END;
$$;

-- Created before the sample data is loaded so that it fires for those rows.
CREATE TRIGGER document_ait AFTER INSERT ON document FOR EACH ROW EXECUTE PROCEDURE document_ait();


-- Sample data

INSERT INTO document (content) VALUES ('Hello World!');
INSERT INTO document (content) VALUES ('The quick brown dog jumped over the lazy dog');
INSERT INTO document (content) VALUES ('One flew over the coo coo''s nest');

-- Once all the individual documents are inserted, then
-- calculate overall corpus statistics.
-- NOTE(review): corpus_statistics is not created anywhere in this script;
-- presumably it already exists with columns matching (word, ndoc, nentry) - confirm.

INSERT INTO corpus_statistics
SELECT word, ndoc, nentry
FROM ts_stat('select document_vector from document');