Re: futex results with dbt-3 - Mailing list pgsql-performance

From Manfred Spraul
Subject Re: futex results with dbt-3
Date
Msg-id 417221B5.1050704@colorfullife.com
Whole thread Raw
In response to futex results with dbt-3  (Mark Wong <markw@osdl.org>)
Responses Re: futex results with dbt-3
Re: futex results with dbt-3
List pgsql-performance
Neil wrote:

>. In any case, the "futex patch"
>uses the Linux 2.6 futex API to implement PostgreSQL spinlocks.
>
Has anyone tried to replace the whole lwlock implementation with
pthread_rwlock? At least on Linux with recent glibcs, pthread_rwlock is
implemented with futexes, i.e. we would get fast lock handling without
OS-specific hacks. Perhaps other OSes contain user-space pthread locks, too.
Attached is an old patch. I tested it on a uniprocessor system a year
ago and it didn't provide much difference, but perhaps the scalability
is better. You'll have to add -lpthread to the library list for linking.

Regarding Neil's patch:

>! /*
>!  * XXX: is there a more efficient way to write this? Perhaps using
>!  * decl...?
>!  */
>! static __inline__ slock_t
>! atomic_dec(volatile slock_t *ptr)
>! {
>!     slock_t prev = -1;
>!
>!     __asm__ __volatile__(
>!         "    lock        \n"
>!         "    xadd %0,%1    \n"
>!         :"=q"(prev)
>!         :"m"(*ptr), "0"(prev)
>!         :"memory", "cc");
>!
>!     return prev;
>! }
>
xadd is not supported by original 80386 CPUs; it was added with the 80486.
There is no instruction on the 80386 that can atomically
decrement an integer and retrieve the old value. The only options are
atomic_dec_test_zero or atomic_dec_test_negative - those can be
implemented by looking at the sign/zero flag. Depending on what you want
this may be enough. Or make the futex code conditional for > 80386 CPUs.

--
    Manfred
--- p7.3.3.orig/src/backend/storage/lmgr/lwlock.c    2002-09-25 22:31:40.000000000 +0200
+++ postgresql-7.3.3/src/backend/storage/lmgr/lwlock.c    2003-09-06 14:15:01.000000000 +0200
@@ -26,6 +26,28 @@
 #include "storage/proc.h"
 #include "storage/spin.h"

+#define USE_PTHREAD_LOCKS
+
+#ifdef USE_PTHREAD_LOCKS
+
+#include <pthread.h>
+#include <errno.h>
+typedef pthread_rwlock_t LWLock;
+
+inline static void
+InitLWLock(LWLock *p)
+{
+    pthread_rwlockattr_t rwattr;
+    int i;
+
+    pthread_rwlockattr_init(&rwattr);
+    pthread_rwlockattr_setpshared(&rwattr, PTHREAD_PROCESS_SHARED);
+    i=pthread_rwlock_init(p, &rwattr);
+    pthread_rwlockattr_destroy(&rwattr);
+    if (i)
+        elog(FATAL, "pthread_rwlock_init failed");
+}
+#else

 typedef struct LWLock
 {
@@ -38,6 +60,17 @@
     /* tail is undefined when head is NULL */
 } LWLock;

+inline static void
+InitLWLock(LWLock *lock)
+{
+    SpinLockInit(&lock->mutex);
+    lock->releaseOK = true;
+    lock->exclusive = 0;
+    lock->shared = 0;
+    lock->head = NULL;
+    lock->tail = NULL;
+}
+#endif
 /*
  * This points to the array of LWLocks in shared memory.  Backends inherit
  * the pointer by fork from the postmaster.  LWLockIds are indexes into
@@ -61,7 +94,7 @@
 static LWLockId held_lwlocks[MAX_SIMUL_LWLOCKS];


-#ifdef LOCK_DEBUG
+#if defined(LOCK_DEBUG) && !defined(USE_PTHREAD_LOCKS)
 bool        Trace_lwlocks = false;

 inline static void
@@ -153,12 +186,7 @@
      */
     for (id = 0, lock = LWLockArray; id < numLocks; id++, lock++)
     {
-        SpinLockInit(&lock->mutex);
-        lock->releaseOK = true;
-        lock->exclusive = 0;
-        lock->shared = 0;
-        lock->head = NULL;
-        lock->tail = NULL;
+        InitLWLock(lock);
     }

     /*
@@ -185,7 +213,116 @@
     return (LWLockId) (LWLockCounter[0]++);
 }

+#ifdef USE_PTHREAD_LOCKS
+/*
+ * LWLockAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+void
+LWLockAcquire(LWLockId lockid, LWLockMode mode)
+{
+    int i;
+    PRINT_LWDEBUG("LWLockAcquire", lockid, &LWLockArray[lockid]);
+
+    /*
+     * We can't wait if we haven't got a PGPROC.  This should only occur
+     * during bootstrap or shared memory initialization.  Put an Assert
+     * here to catch unsafe coding practices.
+     */
+    Assert(!(proc == NULL && IsUnderPostmaster));
+
+    /*
+     * Lock out cancel/die interrupts until we exit the code section
+     * protected by the LWLock.  This ensures that interrupts will not
+     * interfere with manipulations of data structures in shared memory.
+     */
+    HOLD_INTERRUPTS();
+
+    if (mode == LW_EXCLUSIVE) {
+        i = pthread_rwlock_wrlock(&LWLockArray[lockid]);
+    } else {
+        i = pthread_rwlock_rdlock(&LWLockArray[lockid]);
+    }
+    if (i)
+        elog(FATAL, "Unexpected error from pthread_rwlock.");
+
+    /* Add lock to list of locks held by this backend */
+    Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
+    held_lwlocks[num_held_lwlocks++] = lockid;
+}
+
+/*
+ * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, return FALSE with no side-effects.
+ *
+ * If successful, cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
+{
+    int i;
+    PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, &LWLockArray[lockid]);
+
+    HOLD_INTERRUPTS();
+
+    if (mode == LW_EXCLUSIVE) {
+        i = pthread_rwlock_trywrlock(&LWLockArray[lockid]);
+    } else {
+        i = pthread_rwlock_tryrdlock(&LWLockArray[lockid]);
+    }
+    switch(i) {
+        case 0:
+            /* Add lock to list of locks held by this backend */
+            Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
+            held_lwlocks[num_held_lwlocks++] = lockid;
+            return true;
+        case EBUSY:
+            RESUME_INTERRUPTS();
+            return false;
+        default:
+            elog(FATAL, "Unexpected error from pthread_rwlock_try.");
+            return false;
+    }
+}
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLockId lockid)
+{
+    int i;
+
+    /*
+     * Remove lock from list of locks held.  Usually, but not always, it
+     * will be the latest-acquired lock; so search array backwards.
+     */
+    for (i = num_held_lwlocks; --i >= 0;)
+    {
+        if (lockid == held_lwlocks[i])
+            break;
+    }
+    if (i < 0)
+        elog(ERROR, "LWLockRelease: lock %d is not held", (int) lockid);
+    num_held_lwlocks--;
+    for (; i < num_held_lwlocks; i++)
+        held_lwlocks[i] = held_lwlocks[i + 1];
+
+    i = pthread_rwlock_unlock(&LWLockArray[lockid]);
+    if (i)
+        elog(FATAL, "Unexpected error from pthread_rwlock_unlock.");
+
+    /*
+     * Now okay to allow cancel/die interrupts.
+     */
+    RESUME_INTERRUPTS();
+}

+#else
 /*
  * LWLockAcquire - acquire a lightweight lock in the specified mode
  *
@@ -499,6 +636,7 @@
     RESUME_INTERRUPTS();
 }

+#endif

 /*
  * LWLockReleaseAll - release all currently-held locks

pgsql-performance by date:

Previous
From: Gavin Sherry
Date:
Subject: Re: Select with qualified join condition / Batch inserts
Next
From: Tom Lane
Date:
Subject: Re: futex results with dbt-3