From d13aeccf8cd1056f1d98388e78e5ecdc4de56cea Mon Sep 17 00:00:00 2001 From: "David G. Johnston" Date: Mon, 28 Apr 2025 11:45:13 -0700 Subject: [PATCH 1/2] Modernize setup and development runtime environment Thanks to Jelte for most of this. I basically just moved the SQL code to a migration and stood up a dev environment via make. The downgrading of parentid to null-able was required though I do not know why. I added a couple of views to make interactive "select *" doable. Added an upsert to ensure that a dev setup has a list. I reworked the text search configurations from what was in schema.sql. I couldn't figure out how to make the existing code work and for my development needs it wasn't important. --- .gitignore | 6 + Makefile | 10 ++ README.md | 77 +++++++++ dev_requirements.txt | 2 + django/archives/example_settings_local.py | 19 +++ .../migrations/0005_sync_with_loader.py | 131 ++++++++++++++ .../migrations/0006_alter_message_parentid.py | 18 ++ django/archives/mailarchives/models.py | 2 +- django/run_dev.py | 22 +++ django/uwsgi_dev.ini | 10 ++ loader/archives.ini.sample | 16 +- loader/lib/parser.py | 2 +- loader/sql/schema.sql | 160 ------------------ requirements.txt | 7 + 14 files changed, 312 insertions(+), 170 deletions(-) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 dev_requirements.txt create mode 100644 django/archives/example_settings_local.py create mode 100644 django/archives/mailarchives/migrations/0005_sync_with_loader.py create mode 100644 django/archives/mailarchives/migrations/0006_alter_message_parentid.py create mode 100755 django/run_dev.py create mode 100644 django/uwsgi_dev.ini delete mode 100644 loader/sql/schema.sql create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ae1b7d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Ignore files in the Python virtual environment +/env/ + +# A convenient place to store downloaded mbox files from production +# to facilitate testing, and automation. +/mboxes/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..08cc20d --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +# A handy target to reset the development environment back to a clean slate +# and run the development server. +# XXX: For now just use the single mbox file that was previously downloaded. +# Additional work in this area, for testing use cases, is needed. +dev-rebuild-and-run: + dropdb --if-exists archives + createdb archives + django/manage.py migrate + loader/load_message.py --list pgsql-hackers --mbox mboxes/pgsql-hackers.202504 >/dev/null + cd ./django && ./run_dev.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..f5ae54a --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# PG archives + +This application manages PostgreSQL mailing list archives. However, the search +feature is implemented in pgweb. + +## The Application + +This is a Django 4.2 application backed by PostgreSQL and running on Python 3.x. + +## Getting Started + +### Ubuntu instructions + +First, prepare your development environment by installing python3, postgresql-server-dev-X.Y, formail and libtidy (use `--no-install-recommends` to avoid installing postfix): + +```bash +sudo apt install python3 postgresql-server-dev-14 procmail libtidy5deb1 --no-install-recommends +``` + +Next, configure your local environment with virtualenv and install local dependencies. + +```bash +python3 -m venv env +source env/bin/activate +pip install -r dev_requirements.txt +``` + +Create a database for the application: + +```bash +createdb archives +cd django +./manage.py migrate +# Creates pgsql-hackers list with ID 1 if open. +``` + +Create config for the loader scripts: + +```bash +cp loader/archives.ini.sample loader/archives.ini +``` + +Load some emails from the actual PostgreSQL archives by downloading an mbox +file from and running the +following command. NOTE: it's totally fine if some of the emails will fail to +load. + +```bash +loader/load_message.py --list pgsql-hackers --mbox /path/to/downloaded/mbox/file +``` + +Then go to the `django` directory, that's where the actual web application is. + +```bash +cd django +``` + +Create a local settings file (feel free to edit it): + +```bash +cp archives/example_settings_local.py archives/settings_local.py +``` + +Finally, you're ready to start the web application: + +```bash +./run_dev.py +``` + +Or, download the April 2025 mbox file from the PostgreSQL archives and place it in /mboxes. +Then run: +```bash +make dev-rebuild-and-run +``` + +Then open to view your local mailing +list archives. diff --git a/dev_requirements.txt b/dev_requirements.txt new file mode 100644 index 0000000..3b878a3 --- /dev/null +++ b/dev_requirements.txt @@ -0,0 +1,2 @@ +-r requirements.txt +uwsgi diff --git a/django/archives/example_settings_local.py b/django/archives/example_settings_local.py new file mode 100644 index 0000000..4657b62 --- /dev/null +++ b/django/archives/example_settings_local.py @@ -0,0 +1,19 @@ +# Enable more debugging information +DEBUG = True +# Prevent logging to try to send emails to postgresql.org admins. +# Use the default Django logging settings instead. +LOGGING = None + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.postgresql_psycopg2", + "NAME": "archives", + "USER": "postgres", + "PASSWORD": "postgres", + "HOST": "0.0.0.0", + } +} + +# Allow API access to all clients +PUBLIC_ARCHIVES = True +ALLOWED_HOSTS = ["*"] diff --git a/django/archives/mailarchives/migrations/0005_sync_with_loader.py b/django/archives/mailarchives/migrations/0005_sync_with_loader.py new file mode 100644 index 0000000..c12bb52 --- /dev/null +++ b/django/archives/mailarchives/migrations/0005_sync_with_loader.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.18 on 2019-06-19 19:02 +from __future__ import unicode_literals + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0004_resend_rate_limit'), + ] + + operations = [ + migrations.RunSQL( + """ +ALTER TABLE messages ADD COLUMN rawtxt bytea NOT NULL; +ALTER TABLE messages ADD COLUMN fti tsvector NOT NULL; +ALTER TABLE attachments ADD COLUMN attachment bytea NOT NULL; +CREATE TABLE loaderrors( + id SERIAL NOT NULL PRIMARY KEY, + listid int NOT NULL, + dat timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, + msgid text NOT NULL, + srctype text NOT NULL, + src text NOT NULL, + err text NOT NULL +); +CREATE SEQUENCE threadid_seq; +CREATE TABLE list_months( + listid int NOT NULL REFERENCES lists(listid), + year int NOT NULL, + month int NOT NULL, + CONSTRAINT list_months_pk PRIMARY KEY (listid, year, month) +); +CREATE TABLE list_threads( + threadid int NOT NULL, /* comes from threadid_seq */ + listid int NOT NULL REFERENCES lists(listid), + CONSTRAINT pg_list_threads PRIMARY KEY (threadid, listid) +); +CREATE INDEX list_threads_listid_idx ON list_threads(listid); +CREATE TABLE unresolved_messages( + message int NOT NULL REFERENCES messages, + priority int NOT NULL, + msgid text NOT NULL, + CONSTRAINT unresolved_messages_pkey PRIMARY KEY (message, priority) +); +CREATE UNIQUE INDEX idx_unresolved_msgid_message ON unresolved_messages(msgid, message); + +/* A couple of convenience views that exclude the content fields. */ +CREATE VIEW messages_meta AS + SELECT + id, + parentid, + threadid, + _from, + _to, + cc, + subject, + date, + has_attachment, + hiddenstatus, + messageid + FROM messages; + +CREATE VIEW attachments_meta AS + SELECT + id, + message, + filename, + contenttype + FROM attachments; + +INSERT INTO listgroups (groupid, groupname, sortkey) VALUES (1, 'Developer lists', 1) + ON CONFLICT (groupid) DO NOTHING; + +INSERT INTO lists (listid, listname, shortdesc, description, active, groupid, subscriber_access) + VALUES (1, 'pgsql-hackers', 'pgsql-hackers', -- implicit concatentation below + 'The PostgreSQL developers team lives here. ' + 'Discussion of current development issues, problems and bugs, and proposed new features. ' + 'If your question cannot be answered by people in the other lists, ' + 'and it is likely that only a developer will know the answer, you may re-post your question in this list. ' + 'You must try elsewhere first!', True, 1, True) + ON CONFLICT (listid) DO NOTHING; + +CREATE TEXT SEARCH CONFIGURATION pg (COPY=pg_catalog.english); + +/* +CREATE TEXT SEARCH DICTIONARY english_ispell ( + TEMPLATE = ispell, + DictFile = english, + AffFile = english, + StopWords = english +); +CREATE TEXT SEARCH DICTIONARY pg_dict ( + TEMPLATE = synonym, + SYNONYMS = pg_dict +); +CREATE TEXT SEARCH DICTIONARY pg_stop ( + TEMPLATE = simple, + StopWords = pg_dict +); +*/ +ALTER TEXT SEARCH CONFIGURATION pg + ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, + word, hword, hword_part + WITH english_stem; + +ALTER TEXT SEARCH CONFIGURATION pg + DROP MAPPING FOR email, url, url_path, sfloat, float; + +CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ +BEGIN + NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || + setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); + RETURN NEW; +END +$$ LANGUAGE 'plpgsql'; + +CREATE TRIGGER messages_fti_trigger + BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages + FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); + +CREATE INDEX messages_fti_idx ON messages USING gin(fti); + + """, + ), + + ] diff --git a/django/archives/mailarchives/migrations/0006_alter_message_parentid.py b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py new file mode 100644 index 0000000..16d56bc --- /dev/null +++ b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.20 on 2025-04-28 18:37 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0005_sync_with_loader'), + ] + + operations = [ + migrations.AlterField( + model_name='message', + name='parentid', + field=models.IntegerField(blank=True, null=True), + ), + ] diff --git a/django/archives/mailarchives/models.py b/django/archives/mailarchives/models.py index 44c4469..4f75acc 100644 --- a/django/archives/mailarchives/models.py +++ b/django/archives/mailarchives/models.py @@ -25,7 +25,7 @@ class Message(models.Model): messageid = models.TextField(null=False) bodytxt = models.TextField(null=False) # rawtxt is a bytea field, which django doesn't support (easily) - parentid = models.IntegerField(null=False, blank=False) + parentid = models.IntegerField(null=True, blank=True) has_attachment = models.BooleanField(null=False, default=False) hiddenstatus = models.IntegerField(null=True) # fti is a tsvector field, which django doesn't support (easily) diff --git a/django/run_dev.py b/django/run_dev.py new file mode 100755 index 0000000..268cd83 --- /dev/null +++ b/django/run_dev.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +from importlib.machinery import PathFinder +import subprocess +import sys + +django_path = PathFinder().find_spec("django").submodule_search_locations[0] + +django_admin_path = django_path + "/contrib/admin/static/admin" + +if len(sys.argv) > 1: + ini_file = sys.argv[1] +else: + ini_file = "uwsgi_dev.ini" + +subprocess.run( + [ + "uwsgi", + "--static-map", + f"/static/admin={django_path}/contrib/admin/static/admin", + ini_file, + ] +) diff --git a/django/uwsgi_dev.ini b/django/uwsgi_dev.ini new file mode 100644 index 0000000..9ab26b2 --- /dev/null +++ b/django/uwsgi_dev.ini @@ -0,0 +1,10 @@ +[uwsgi] +threads=1 +env=DJANGO_SETTINGS_MODULE=archives.settings +module=archives.wsgi:application +py-autoreload=1 +touch-reload = archives/settings.py +touch-reload = archives/settings_local.py +touch-reload = uwsgi_dev.ini +http=127.0.0.1:8001 +static-map=/media-archives=media diff --git a/loader/archives.ini.sample b/loader/archives.ini.sample index a146f69..1c3c6e2 100644 --- a/loader/archives.ini.sample +++ b/loader/archives.ini.sample @@ -2,16 +2,16 @@ connstr=dbname=archives [varnish] -purgeurl=https://wrigleys.postgresql.org/api/varnish/purge/ +#purgeurl=https://wrigleys.postgresql.org/api/varnish/purge/ [smtp] -server=localhost:9911 -heloname=localhost -resender=noreply@example.com +#server=localhost:9911 +#heloname=localhost +#resender=noreply@example.com [pglister] # synchronize subscribers between pgarchives and pglister -subscribers=0 -root=/path/to/pglister -myname=pgarchives -apikey=CHANGEME +#subscribers=0 +#root=/path/to/pglister +#myname=pgarchives +#apikey=CHANGEME diff --git a/loader/lib/parser.py b/loader/lib/parser.py index 027ed53..e81b193 100644 --- a/loader/lib/parser.py +++ b/loader/lib/parser.py @@ -62,7 +62,7 @@ class ArchivesParser(object): self.date = lowdate # Else we're going to go with what we found self.bodytxt = self.get_body() - self.attachments = [] + self.attachments = [] # (filename, contenttype, payload) self.get_attachments() if len(self.attachments) > 0: log.status("Found %s attachments" % len(self.attachments)) diff --git a/loader/sql/schema.sql b/loader/sql/schema.sql deleted file mode 100644 index be735d9..0000000 --- a/loader/sql/schema.sql +++ /dev/null @@ -1,160 +0,0 @@ -\set ON_ERROR_STOP on - -BEGIN; - -CREATE TABLE messages ( - id SERIAL NOT NULL PRIMARY KEY, - parentid int REFERENCES messages, - threadid int NOT NULL, - _from text NOT NULL, - _to text NOT NULL, - cc text NOT NULL, - subject text NOT NULL, - date timestamptz NOT NULL, - loaddate timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - has_attachment boolean NOT NULL, - hiddenstatus int NULL, - messageid text NOT NULL, - bodytxt text NOT NULL, - rawtxt bytea NOT NULL, - fti tsvector NOT NULL -); -CREATE INDEX idx_messages_threadid ON messages(threadid); -CREATE UNIQUE INDEX idx_messages_msgid ON messages(messageid); -CREATE INDEX idx_messages_date ON messages(date); -CREATE INDEX idx_messages_parentid ON messages(parentid); - -CREATE TABLE message_hide_reasons ( - message int NOT NULL PRIMARY KEY REFERENCES messages, - dt timestamptz, - reason text, - by text -); - -CREATE SEQUENCE threadid_seq; - -CREATE TABLE unresolved_messages( - message int NOT NULL REFERENCES messages, - priority int NOT NULL, - msgid text NOT NULL, - CONSTRAINT unresolved_messages_pkey PRIMARY KEY (message, priority) -); - -CREATE UNIQUE INDEX idx_unresolved_msgid_message ON unresolved_messages(msgid, message); - -CREATE TABLE listgroups( - groupid int NOT NULL PRIMARY KEY, - groupname text NOT NULL UNIQUE, - sortkey int NOT NULL -); - -CREATE TABLE lists( - listid int NOT NULL PRIMARY KEY, - listname text NOT NULL UNIQUE, - shortdesc text NOT NULL, - description text NOT NULL, - active boolean NOT NULL, - groupid int NOT NULL REFERENCES listgroups(groupid) -); - -CREATE TABLE list_months( - listid int NOT NULL REFERENCES lists(listid), - year int NOT NULL, - month int NOT NULL, - CONSTRAINT list_months_pk PRIMARY KEY (listid, year, month) -); - -CREATE TABLE list_threads( - threadid int NOT NULL, /* comes from threadid_seq */ - listid int NOT NULL REFERENCES lists(listid), - CONSTRAINT pg_list_threads PRIMARY KEY (threadid, listid) -); -CREATE INDEX list_threads_listid_idx ON list_threads(listid); - -CREATE TABLE attachments( - id serial not null primary key, - message int not null references messages(id), - filename text not null, - contenttype text not null, - attachment bytea not null -); -CREATE INDEX idx_attachments_msg ON attachments(message); - -CREATE TABLE loaderrors( - id SERIAL NOT NULL PRIMARY KEY, - listid int NOT NULL, - dat timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - msgid text NOT NULL, - srctype text NOT NULL, - src text NOT NULL, - err text NOT NULL -); - -/* textsearch configs */ -CREATE TEXT SEARCH CONFIGURATION pg (PARSER=tsparser); - -CREATE TEXT SEARCH DICTIONARY english_ispell ( - TEMPLATE = ispell, - DictFile = en_us, - AffFile = en_us, - StopWords = english -); -CREATE TEXT SEARCH DICTIONARY pg_dict ( - TEMPLATE = synonym, - SYNONYMS = pg_dict -); -CREATE TEXT SEARCH DICTIONARY pg_stop ( - TEMPLATE = simple, - StopWords = pg_dict -); -ALTER TEXT SEARCH CONFIGURATION pg - ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, - word, hword, hword_part - WITH pg_stop, pg_dict, english_ispell, english_stem; -ALTER TEXT SEARCH CONFIGURATION pg - DROP MAPPING FOR email, url, url_path, sfloat, float; - -CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ -BEGIN - NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || - setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); - RETURN NEW; -END -$$ LANGUAGE 'plpgsql'; - -CREATE TRIGGER messages_fti_trigger - BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages - FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); -CREATE INDEX messages_fti_idx ON messages USING gin(fti); - -CREATE TABLE legacymap( - listid int not null, - year int not null, - month int not null, - msgnum int not null, - msgid text not null, -CONSTRAINT legacymap_pk PRIMARY KEY (listid, year, month, msgnum) -); - -/* Simple API for hiding messages */ -CREATE OR REPLACE FUNCTION hide_message(msgid_txt text, reason_code integer, user_txt text, reason_txt text) - RETURNS integer AS -$BODY$ -DECLARE - returned_id integer; -BEGIN - UPDATE messages SET hiddenstatus = reason_code WHERE messageid = msgid_txt RETURNING id INTO returned_id; - - IF NOT FOUND THEN - RAISE EXCEPTION 'The specified message (%) could not be found.', msgid_txt; - END IF; - - INSERT INTO message_hide_reasons (message, dt, reason, by) VALUES (returned_id, now(), reason_txt, user_txt); - - RETURN returned_id; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -\echo Dont forget to commit! diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0064172 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +django>=4.2,<4.3 +psycopg2 +requests +pycryptodome +pycryptodomex +python-dateutil +pytidylib -- 2.34.1