From cb8ff168e486a0423c8a24185d3e6927931ff74d Mon Sep 17 00:00:00 2001 From: "David G. Johnston" Date: Mon, 28 Apr 2025 13:12:20 -0700 Subject: [PATCH 2/2] Implement Advanced Searching Features and CFApp Integration Note: None of this is fully fleshed out. Mainly because it isn't my primary focus at the moment; I just needed to do enough to allow this to interoperate with CFApp. This patch removes all reliance on pgweb to search and view the archive. It should be fairly simple to incorporate the relevant changes into pgweb if desired but ideally this application should provide both the archive and the user interface. The present reliance of pgweb to generate string html and write it out using {{html | safe}} is largely abolished in favor of proper html element usage - divs and span with classes. Note in particular the abstract_found and abstract_after reworking. The added __str__ overrides on list and groups allows the drop-down selection list to show the proper labels. The related work in views.py is basically to continue to handle both the pgweb/simple search interface as well as this new advanced search interface. The search implementation has been factored out and the new search view added for local search - leaving archive-search for pgweb. A while back I made a feature request to limit search results to a single message per thread. I've implemented an initial version of that here. The advanced search page re-implemented here adds a checkbox to enable this mode; and the results show both the thread id and ranking within the thread, whether "oneperthread" is checked or not. This patch also introduces a new function named is_patch. This uses the same exact criteria that CFBot uses. The thinking here is that knowing which messages and threads contains patches is useful context to use while searching or within search results. The immediate use for this information is the newly added threads view. Presently coded for demonstration purposes only, this view presents the most recent threads that contain patches. With this information it is possible, and demonstrated, to actually push all of the needed information from pgarchvies to pgcommitfest to create a new patch, with reasonable defaults. For the moment it is created using CFBot as the creator but given our single sign-on authentication filtering the thread results on user and creating the patch with them as author should be doable. Content-Type now gets passed around in the attachment metadata. --- django/archives/example_settings_local.py | 2 + django/archives/mailarchives/api.py | 171 ++++++++++- .../migrations/0007_add_is_patch_function.py | 24 ++ django/archives/mailarchives/models.py | 6 + .../templates/advancedsearch.html | 81 ++++++ .../mailarchives/templates/threads.html | 117 ++++++++ django/archives/mailarchives/views.py | 266 ++++++++++++++---- django/archives/urls.py | 5 + 8 files changed, 609 insertions(+), 63 deletions(-) create mode 100644 django/archives/mailarchives/migrations/0007_add_is_patch_function.py create mode 100644 django/archives/mailarchives/templates/advancedsearch.html create mode 100644 django/archives/mailarchives/templates/threads.html diff --git a/django/archives/example_settings_local.py b/django/archives/example_settings_local.py index 4657b62..6ff29fe 100644 --- a/django/archives/example_settings_local.py +++ b/django/archives/example_settings_local.py @@ -17,3 +17,5 @@ DATABASES = { # Allow API access to all clients PUBLIC_ARCHIVES = True ALLOWED_HOSTS = ["*"] + +PGWEB_ADDRESS = 'http://localhost:8001' diff --git a/django/archives/mailarchives/api.py b/django/archives/mailarchives/api.py index a6b2536..135fc0c 100644 --- a/django/archives/mailarchives/api.py +++ b/django/archives/mailarchives/api.py @@ -1,13 +1,15 @@ from django.http import HttpResponse, HttpResponseForbidden from django.shortcuts import get_object_or_404 from django.conf import settings +from django.db import connection import ipaddress from .views import cache from .models import Message, List import json - +import requests +from django.http import JsonResponse def is_host_allowed(request): for ip_range in settings.API_CLIENTS: @@ -117,9 +119,174 @@ def thread(request, msgid): 'date': m.date.isoformat(), 'from': m.mailfrom, 'subj': m.subject, - 'atts': [{'id': a.id, 'name': a.filename} for a in m.attachment_set.all()], + 'atts': [{'id': a.id, 'name': a.filename, 'is_patch': a.is_patch, 'content_type': a.contenttype} + for a in m.attachment_set.extra(select={'is_patch': 'attachments.is_patch'}).all()], } for m in mlist], resp) if settings.PUBLIC_ARCHIVES: resp['xkey'] = 'pgat_{0}'.format(msg.threadid) return resp + +def threads_with_patches(request): + if not settings.PUBLIC_ARCHIVES: + return HttpResponseForbidden('No API access on private archives for now') + + with connection.cursor() as cursor: + cursor.execute("""-- Find threads with patches + select * + from ( + select distinct on (threadid) + pm.threadid, + pm.id, + pm._from, + pm.subject, + pm.messageid, + ma.patch_count, + tm.subject AS thread_subject, + pm.date AS patch_date, + tm.date AS thread_date, + tm.messageid AS thread_messageid + from messages AS pm --patch message + -- threadid is a shared value but not a foreign key to anything + -- in particular, it is not a self-join of messages + join lateral ( + select * + from messages as im + where im.threadid = pm.threadid + order by im.date asc + limit 1 + ) AS tm on true --thread message is first known message + join lateral ( + select count(*) as patch_count + from attachments + where pm.id = attachments.message and is_patch(attachments) + ) as ma on true + where pm.has_attachment and ma.patch_count > 0 and pm.hiddenstatus is null + order by pm.threadid, pm.date desc + ) as threads_with_patches + order by patch_date DESC + limit 10; + """) + rows = cursor.fetchall() + + # Convert the SQL result into thread_list + thread_list = [ + { + "thread_id": str(row[0]), + "message_id": row[1], + "file_count": row[5], + "file_version": None, + "commit_sha": None, + "patch_id": None, + "subject_line": row[3], + "thread_subject": row[6], + "sender": row[2], + "id": row[1], + "patch_date": row[7].strftime('%Y-%m-%d %H:%M:%S') if row[7] else None, + "thread_date": row[8].strftime('%Y-%m-%d %H:%M:%S') if row[8] else None, + "message_code": row[4], + "thread_code": row[9] + } + for row in rows + ] + + resp = HttpResponse(content_type='application/json') + json.dump(thread_list, resp) + + return resp + +def get_patch_data_as_json(threadid, messageid): + with connection.cursor() as cursor: + cursor.execute("""-- Find threads with patches + select + pm.threadid, + pm.id, + tm.messageid as thread_messageid, + mrm.mostrecent_messageid, + pm.messageid as patch_messageid, + ma.fileset, + pm._from as patch_from_author, + tm.date as thread_messagedate, + mrm.mostrecent_messagedate, + pm.date as patch_messagedate, + tm.subject as thread_subject_line, + mrm.most_recent_subject_line, + mrm.most_recent_from_author, + tm._from as thread_from_author + from messages AS pm --patch message + join lateral ( + select * + from messages as im + where im.threadid = pm.threadid + order by im.date asc + limit 1 + ) AS tm on true --thread message is first known message + join lateral ( + select + id as mostrecent_id, + messageid as mostrecent_messageid, + date as mostrecent_messagedate, + subject as most_recent_subject_line, + _from as most_recent_from_author + from messages + where threadid = pm.threadid + order by date desc limit 1 + ) as mrm on true + join lateral ( + select jsonb_agg( + jsonb_build_object( + 'attachment_id', a.id, + 'filename', a.filename, + 'content_type', a.contenttype, + 'is_patch', is_patch(a) + ) order by a.filename) as fileset + from attachments as a + where pm.id = a.message + ) as ma on true + where pm.id = %s; + """, + (messageid,)) + row = cursor.fetchone() + + # Convert the SQL result into patch_data + patch_data = { + "thread_id": row[0], + "message_id": row[1], + "thread_message_id": row[2], + "most_recent_message_id": row[3], + "patch_message_id": row[4], + "patch_from_author": row[6], + "fileset": json.loads(row[5]) if row[5] else [], + "thread_message_date": row[7].isoformat() if row[7] else None, + "most_recent_message_date": row[8].isoformat() if row[8] else None, + "patch_message_date": row[9].isoformat() if row[9] else None, + "thread_subject_line": row[10], + "most_recent_subject_line": row[11], + "most_recent_from_author": row[12], + "thread_from_author": row[13], + } + + return json.dumps(patch_data) + +def create_cfapp_patch(request): + if not settings.PUBLIC_ARCHIVES: + return HttpResponseForbidden('No API access on private archives for now') + + if request.method != 'POST': + return JsonResponse({'error': 'Invalid request method'}, status=405) + + body_string = request.body.decode("utf-8") + body_json = json.loads(body_string) + + try: + # Forward the request body to the external service + response = requests.post( + 'http://localhost:8007/api/test/cfapp/create_patch', + headers={'Content-Type': 'application/json'}, + data=get_patch_data_as_json(body_json["thread_id"], body_json["message_id"]), + ) + + # Return the response from the external service + return JsonResponse(response.json(), status=response.status_code) + except requests.RequestException as e: + return JsonResponse({'error': f'Failed to proxy request: {str(e)}'}, status=500) diff --git a/django/archives/mailarchives/migrations/0007_add_is_patch_function.py b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py new file mode 100644 index 0000000..1420926 --- /dev/null +++ b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.18 on 2019-06-19 19:02 +from __future__ import unicode_literals + +from django.db import migrations + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0006_alter_message_parentid'), + ] + + operations = [ + migrations.RunSQL( + """ +CREATE FUNCTION is_patch(att attachments) RETURNS boolean LANGUAGE sql IMMUTABLE STRICT + RETURN (att).filename ~ '\.(diff|diff\.gz|patch|patch\.gz|tar\.gz|tgz|tar\.bz2|zip)$'; +; + """, + reverse_sql=""" +DROP FUNCTION is_patch(att attachments); + """, + ), + ] diff --git a/django/archives/mailarchives/models.py b/django/archives/mailarchives/models.py index 4f75acc..c0ff6ea 100644 --- a/django/archives/mailarchives/models.py +++ b/django/archives/mailarchives/models.py @@ -74,6 +74,9 @@ class ListGroup(models.Model): groupname = models.CharField(max_length=200, null=False, blank=False) sortkey = models.IntegerField(null=False) + def __str__(self): + return self.groupname + class Meta: db_table = 'listgroups' @@ -93,6 +96,9 @@ class List(models.Model): return self.shortdesc return self.listname + def __str__(self): + return self.listname + class Meta: db_table = 'lists' diff --git a/django/archives/mailarchives/templates/advancedsearch.html b/django/archives/mailarchives/templates/advancedsearch.html new file mode 100644 index 0000000..7f9d9b1 --- /dev/null +++ b/django/archives/mailarchives/templates/advancedsearch.html @@ -0,0 +1,81 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing Lists Search{%endblock%} + +{%block contents%} +

PostgreSQL Mailing Lists Search

+ +
+
+
+ +
+ + + + +
+
+
Examples 2025-04 (early)
+
CANqtF-pgb87qQr94rMeWKsAa2JGBw9Ygo_wH2bzvVZpi4Mnaig@mail.gmail.com
+
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+
+
+ +{%if search_error %} +
{{search_error}}
+{%elif query == '' %} +
Click the magnifying glass on the query box to search.
+{%else%} + + {%if hitcount == 0 %} +

Your search for {{query}} returned no hits.

+ {%else%} +

Results {{firsthit}}-{{lasthit}} of {%if hitcount == 1000%}more than 1000{%else%}{{hitcount}}{%endif%}.

+ {%if pagelinks %}Result pages: {{pagelinks|safe}}

{%endif%} + {%for hit in hits %} +
+
{{forloop.counter0|add:firsthit}}. {{hit.subject}} [{{hit.rank|floatformat:2}}]
+
From {{hit.author}} on {{hit.date}}.
+
Thread# {{hit.threadid}} Rank: {{hit.thread_rank}}
+
{{hit.abstract_found}} {{hit.abstract_after}}
+
{{PGWEB_ADDRESS}}/message-id/{{hit.messageid}}
+
+ {%endfor%} + {%if pagelinks %}Result pages: {{pagelinks|safe}}

{%endif%} + {%endif%} +{%endif%} + +{%endblock%} diff --git a/django/archives/mailarchives/templates/threads.html b/django/archives/mailarchives/templates/threads.html new file mode 100644 index 0000000..9def39c --- /dev/null +++ b/django/archives/mailarchives/templates/threads.html @@ -0,0 +1,117 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing List Archives{%endblock%} +{%load pgfilters%} +{%block contents%} +

Thread Viewer

+ + + + + + + + + + + + + + + + + + + + + + +
Subject LineSenderThread SubjectFile CountFile VersionCommit SHAPatch IDPatch DateThread DateIDAction
+ + + +{%endblock%} diff --git a/django/archives/mailarchives/views.py b/django/archives/mailarchives/views.py index 69172bd..647f561 100644 --- a/django/archives/mailarchives/views.py +++ b/django/archives/mailarchives/views.py @@ -706,67 +706,32 @@ def resend_complete(request, messageid): }) -@csrf_exempt -def search(request): - if not settings.PUBLIC_ARCHIVES: - # We don't support searching of non-public archives at all at this point. - # XXX: room for future improvement - return HttpResponseForbidden('Not public archives') - # Only certain hosts are allowed to call the search API - allowed = False - for ip_range in settings.SEARCH_CLIENTS: - if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range): - allowed = True - break - if not allowed: - return HttpResponseForbidden('Invalid host') +def perform_search(query, datecode, sortcode, oneperthread=False, listid=None, listnames=None, streamer=None): + if not query and not streamer: + return [] - curs = connection.cursor() + if not query and streamer: + return False - # Perform a search of the archives and return a JSON document. - # Expects the following (optional) POST parameters: - # q = query to search for - # ln = comma separate list of listnames to search in - # d = number of days back to search for, or -1 (or not specified) - # to search the full archives - # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date] - if not request.method == 'POST': - raise Http404('I only respond to POST') + if listid and listnames: + raise Exception("Cannot specify both listid and listname") - if 'q' not in request.POST: - raise Http404('No search query specified') - query = request.POST['q'] + curs = connection.cursor() - if 'ln' in request.POST: + lists = None + if listnames: try: curs.execute("SELECT listid FROM lists WHERE listname=ANY(%(names)s)", { - 'names': request.POST['ln'].split(','), + 'names': listnames.split(','), }) lists = [x for x, in curs.fetchall()] except Exception: # If failing to parse list of lists, just search all lists = None - else: - lists = None - if 'd' in request.POST: - days = int(request.POST['d']) - if days < 1 or days > 365: - firstdate = None - else: - firstdate = datetime.now() - timedelta(days=days) - else: - firstdate = None - - if 's' in request.POST: - list_sort = request.POST['s'] - if list_sort not in ('d', 'r', 'i'): - list_stort = 'r' - else: - list_sort = 'r' - - # Ok, we have all we need to do the search + if listid: + lists = [listid] if query.find('@') > 0: cleaned_id = query.strip().removeprefix('<').removesuffix('>') @@ -778,15 +743,59 @@ def search(request): }) a = curs.fetchall() if len(a) == 1: - # Yup, this was a messageid - resp = HttpResponse(content_type='application/json') - - json.dump({'messageidmatch': 1}, resp) - return resp + if streamer: + json.dump({'messageidmatch': 1}, streamer) + else: + return [{'messageidmatch': cleaned_id}] # If not found, fall through to a regular search + firstdate = None + if datecode: + days = int(datecode) + if days >= 1 and days <= 365: + firstdate = datetime.now() - timedelta(days=days) + + list_sort = 'i' + if sortcode: + if sortcode in ('d', 'r', 'i'): + list_sort = sortcode + curs.execute("SET gin_fuzzy_search_limit=10000") - qstr = "SELECT messageid, date, subject, _from, ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), ts_headline(bodytxt, plainto_tsquery('public.pg', %(q)s),'StartSel=\"[[[[[[\",StopSel=\"]]]]]]\"') FROM messages m WHERE fti @@ plainto_tsquery('public.pg', %(q)s)" + + qstr = """-- Search for messages matching query -- +SELECT * FROM ( +SELECT + *, +""" + qstr += " row_number() over (partition by threadid order by " + if list_sort == 'r': + qstr += "ts_rank_cd DESC" + elif list_sort == 'd': + qstr += "date DESC" + else: + qstr += "date ASC" + qstr += ") AS thread_rank" + + qstr +=""" +FROM +( + SELECT + messageid, + threadid, + date, + subject, + _from, + ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), + ts_headline( + bodytxt, + plainto_tsquery('public.pg', %(q)s), + 'StartSel=\"[[[[[[\", + StopSel=\"]]]]]]\"' + ) + FROM messages m + WHERE fti @@ plainto_tsquery('public.pg', %(q)s) +""" + params = { 'q': query, } @@ -796,18 +805,22 @@ def search(request): if firstdate: qstr += " AND m.date > %(date)s" params['date'] = firstdate + + qstr += ") AS finding ) AS ranking" + + if oneperthread: + qstr += " WHERE thread_rank = 1" + if list_sort == 'r': - qstr += " ORDER BY ts_rank_cd(fti, plainto_tsquery(%(q)s)) DESC LIMIT 1000" + qstr += " ORDER BY ts_rank_cd DESC LIMIT 1000" elif list_sort == 'd': qstr += " ORDER BY date DESC LIMIT 1000" else: qstr += " ORDER BY date ASC LIMIT 1000" curs.execute(qstr, params) - - resp = HttpResponse(content_type='application/json') - - json.dump([ + if streamer: + json.dump([ { 'm': messageid, 'd': date.isoformat(), @@ -815,10 +828,141 @@ def search(request): 'f': mailfrom, 'r': rank, 'a': abstract.replace("[[[[[[", "").replace("]]]]]]", ""), - } for messageid, date, subject, mailfrom, rank, abstract in curs.fetchall()], - resp) + } for messageid, threadid, date, subject, mailfrom, rank, abstract, thread_rank in curs.fetchall()], + streamer) + return True + else: + return [ + { + 'm': messageid, + 't': threadid, + 'tr': thread_rank, + 'd': date.isoformat(), + 's': subject, + 'f': mailfrom, + 'r': rank, + 'a': abstract.replace("[[[[[[", "").replace("]]]]]]", ""), + 'a_found': abstract[abstract.find("[[[[[[") + 6:abstract.find("]]]]]]")], + 'a_after': abstract.replace(abstract[abstract.find("[[[[[["):abstract.find("]]]]]]") + 6], "").replace("[[[[[[", "").replace("]]]]]]", ""), + } for messageid, threadid, date, subject, mailfrom, rank, abstract, thread_rank in curs.fetchall()] + + +def advanced_search(request): + """ + 'pagelinks': " ".join( + generate_pagelinks(pagenum, + (totalhits - 1) // hitsperpage + 1, + querystr)), + """ + queryval = request.GET.get('q', None) + sortval = request.GET.get('s', 'd') + dateval = request.GET.get('d', '-1') + oneperthread = request.GET.get('r', '0') + + listid = 1 + + hits = perform_search(queryval, dateval, sortval, oneperthread=='1', listid=listid) + + totalhits = len(hits) + + if totalhits == 1: + # might be a messageid match + if 'messageidmatch' in hits[0]: + return HttpResponseRedirect('/message-id/%s' % hits[0]['messageidmatch']) + + firsthit = 1 + hitsperpage = 20 + + sortoptions = ( + {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')}, + {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'}, + {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'}, + ) + + dateoptions = ( + {'val': -1, 'text': 'anytime'}, + {'val': 1, 'text': 'within last day'}, + {'val': 7, 'text': 'within last week'}, + {'val': 31, 'text': 'within last month'}, + {'val': 186, 'text': 'within last 6 months'}, + {'val': 365, 'text': 'within last year'}, + ) + + (groups, listgroupid) = get_all_groups_and_lists(request) + return render_nav(NavContext(request, all_groups=groups), 'advancedsearch.html', { + 'groups': [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups], + 'hitcount': totalhits, + 'firsthit': firsthit, + 'lasthit': min(totalhits, firsthit + hitsperpage - 1), + 'query': request.GET['q'] if 'q' in request.GET else '', + 'archives_root': '/', #settings.ARCHIVES_FRONT_ADDRESS, + 'pagelinks': '', + 'hits': [{ + 'date': h['d'], + 'subject': h['s'], + 'author': h['f'], + 'messageid': h['m'], + 'threadid': h['t'], + 'thread_rank': h['tr'], + 'abstract': h['a'], + 'abstract_found': h['a_found'], + 'abstract_after': h['a_after'], + 'rank': h['r'], + } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]], + 'sortoptions': sortoptions, + 'lists': List.objects.all().order_by("group__sortkey"), + 'listid': listid, + 'dates': dateoptions, + 'dateval': dateval, + 'oneperthread': oneperthread, + }) + +@csrf_exempt +def search(request): + if not settings.PUBLIC_ARCHIVES: + # We don't support searching of non-public archives at all at this point. + # XXX: room for future improvement + return HttpResponseForbidden('Not public archives') + + # Only certain hosts are allowed to call the search API + allowed = False + for ip_range in settings.SEARCH_CLIENTS: + if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range): + allowed = True + break + if not allowed: + return HttpResponseForbidden('Invalid host') + + # Perform a search of the archives and return a JSON document. + # Expects the following (optional) POST parameters: + # q = query to search for + # ln = comma separate list of listnames to search in + # d = number of days back to search for, or -1 (or not specified) + # to search the full archives + # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date] + if not request.method == 'POST': + raise Http404('I only respond to POST') + + if 'q' not in request.POST: + raise Http404('No search query specified') + query = request.POST['q'] + ln = request.POST['ln'] if 'ln' in request.POST else None + + dateval = request.POST.get('d', '-1') + sortval = request.POST.get('s', 'i') + + resp = HttpResponse(content_type='application/json') + perform_search(query, dateval, sortval, listname=ln, streamer=resp) return resp +def threads(request): + return render( + request, + 'threads.html', + { + 'request': request, + }) + @cache(seconds=10) def web_sync_timestamp(request): diff --git a/django/archives/urls.py b/django/archives/urls.py index 993c9f9..e10fb00 100644 --- a/django/archives/urls.py +++ b/django/archives/urls.py @@ -9,6 +9,10 @@ import archives.mailarchives.views import archives.mailarchives.api urlpatterns = [ + re_path(r'^threads/', archives.mailarchives.views.threads), + re_path(r'^api/threads_with_patches/', archives.mailarchives.api.threads_with_patches), + re_path(r'^api/create_cfapp_patch$', archives.mailarchives.api.create_cfapp_patch), + # Examples: # re_path(r'^$', 'archives.views.home', name='home), # re_path(r'^archives/', include('archives.foo.urls')), @@ -44,6 +48,7 @@ urlpatterns = [ # Search re_path(r'^archives-search/', archives.mailarchives.views.search), + re_path(r'^search/$', archives.mailarchives.views.advanced_search), # Date etc indexes re_path(r'^list/([\w-]+)/$', archives.mailarchives.views.monthlist), -- 2.34.1