ssi-tuple-not-row-locking-4.patch
text/x-diff
Filename: ssi-tuple-not-row-locking-4.patch
Type: text/x-diff
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: context
| File | + | − |
|---|---|---|
| src/backend/access/heap/heapam.c | 2 | 0 |
| src/backend/access/index/indexam.c | 1 | 0 |
| src/backend/storage/lmgr/predicate.c | 98 | 3 |
| src/backend/storage/lmgr/README-SSI | 48 | 0 |
| src/include/storage/predicate.h | 0 | 0 |
| src/test/isolation/expected/multiple-row-versions.out | 1 | 0 |
| src/test/isolation/specs/multiple-row-versions.spec | 2 | 3 |
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 1529,1535 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
OffsetNumber offnum;
bool at_chain_start;
bool valid;
- bool match_found;
if (all_dead)
*all_dead = true;
--- 1529,1534 ----
***************
*** 1539,1545 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
offnum = ItemPointerGetOffsetNumber(tid);
at_chain_start = true;
- match_found = false;
/* Scan through possible multiple members of HOT-chain */
for (;;)
--- 1538,1543 ----
***************
*** 1597,1606 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
! if (IsolationIsSerializable())
! match_found = true;
! else
! return true;
}
/*
--- 1595,1601 ----
PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
! return true;
}
/*
***************
*** 1629,1635 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
break; /* end of chain */
}
! return match_found;
}
/*
--- 1624,1630 ----
break; /* end of chain */
}
! return false;
}
/*
***************
*** 2855,2866 **** l2:
END_CRIT_SECTION();
- /*
- * Any existing SIREAD locks on the old tuple must be linked to the new
- * tuple for conflict detection purposes.
- */
- PredicateLockTupleRowVersionLink(relation, &oldtup, heaptup);
-
if (newbuf != buffer)
LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 2850,2855 ----
*** a/src/backend/access/index/indexam.c
--- b/src/backend/access/index/indexam.c
***************
*** 612,619 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot)
! && !IsolationIsSerializable())
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
--- 612,618 ----
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot))
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
*** a/src/backend/storage/lmgr/README-SSI
--- b/src/backend/storage/lmgr/README-SSI
***************
*** 402,407 **** is based on the top level xid. When looking at an xid that comes
--- 402,455 ----
from a tuple's xmin or xmax, for example, we always call
SubTransGetTopmostTransaction() before doing much else with it.
+ * PostgreSQL does not use "update in place" with a rollback log
+ for its MVCC implementation. Where possible it uses "HOT" updates on
+ the same page (if there is room and no indexed value is changed).
+ For non-HOT updates the old tuple is expired in place and a new tuple
+ is inserted at a new location. Because of this difference, a tuple
+ lock in PostgreSQL doesn't automatically lock any other versions of a
+ row. We don't try to copy or expand a tuple lock to any other
+ versions of the row, based on the following proof that any additional
+ serialization failures we would get from that would be false
+ positives:
+
+ o If transaction T1 reads a row (thus acquiring a predicate
+ lock on it) and a second transaction T2 updates that row, must a
+ third transaction T3 which updates the new version of the row have a
+ rw-conflict in from T1 to prevent anomalies? In other words, does it
+ matter whether this edge T1 -> T3 is there?
+
+ o If T1 has a conflict in, it certainly doesn't. Adding the
+ edge T1 -> T3 would create a dangerous structure, but we already had
+ one from the edge T1 -> T2, so we would have aborted something
+ anyway.
+
+ o Now let's consider the case where T1 doesn't have a
+ conflict in. If that's the case, for this edge T1 -> T3 to make a
+ difference, T3 must have a rw-conflict out that induces a cycle in
+ the dependency graph, i.e. a conflict out to some transaction
+ preceding T1 in the serial order. (A conflict out to T1 would work
+ too, but that would mean T1 has a conflict in and we would have
+ rolled back.)
+
+ o So now we're trying to figure out if there can be an
+ rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+ T1. For T0 to precede T1, there has to be has to be some edge, or
+ sequence of edges, from T0 to T1. At least the last edge has to be a
+ wr-dependency or ww-dependency rather than a rw-conflict, because T1
+ doesn't have a rw-conflict in. And that gives us enough information
+ about the order of transactions to see that T3 can't have a
+ rw-dependency to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would be aborted
+ because of an update conflict)
+
+ o That means T0 committed before T3 started, and therefore
+ there can't be a rw-conflict from T3 to T0.
+
+ o In both cases, we didn't need the T1 -> T3 edge.
+
* Predicate locking in PostgreSQL will start at the tuple level
when possible, with automatic conversion of multiple fine-grained
locks to coarser granularity as need to avoid resource exhaustion.
*** a/src/backend/storage/lmgr/predicate.c
--- b/src/backend/storage/lmgr/predicate.c
***************
*** 155,163 ****
* BlockNumber newblkno);
* PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
* BlockNumber newblkno);
- * PredicateLockTupleRowVersionLink(const Relation relation,
- * const HeapTuple oldTuple,
- * const HeapTuple newTuple)
* ReleasePredicateLocks(bool isCommit)
*
* conflict detection (may also trigger rollback)
--- 155,160 ----
***************
*** 2252,2341 **** PredicateLockTuple(const Relation relation, const HeapTuple tuple)
PredicateLockAcquire(&tag);
}
- /*
- * If the old tuple has any predicate locks, copy them to the new target.
- *
- * This is called at an UPDATE, where any predicate locks held on the old
- * tuple need to be copied to the new tuple, because logically they both
- * represent the same row. A lock taken before the update must conflict
- * with anyone locking the same row after the update.
- */
- void
- PredicateLockTupleRowVersionLink(const Relation relation,
- const HeapTuple oldTuple,
- const HeapTuple newTuple)
- {
- PREDICATELOCKTARGETTAG oldtupletag;
- PREDICATELOCKTARGETTAG oldpagetag;
- PREDICATELOCKTARGETTAG newtupletag;
- BlockNumber oldblk,
- newblk;
- OffsetNumber oldoff,
- newoff;
- TransactionId oldxmin,
- newxmin;
-
- /*
- * Bail out quickly if there are no serializable transactions
- * running.
- *
- * It's safe to do this check without taking any additional
- * locks. Even if a serializable transaction starts concurrently,
- * we know it can't take any SIREAD locks on the modified tuple
- * because the caller is holding the associated buffer page lock.
- * Memory reordering isn't an issue; the memory barrier in the
- * LWLock acquisition guarantees that this read occurs while the
- * buffer page lock is held.
- */
- if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
- return;
-
- oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
- oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
- oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
-
- newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
- newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
- newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
-
- SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
- relation->rd_node.dbNode,
- relation->rd_id,
- oldblk,
- oldoff,
- oldxmin);
-
- SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
- relation->rd_node.dbNode,
- relation->rd_id,
- oldblk);
-
- SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
- relation->rd_node.dbNode,
- relation->rd_id,
- newblk,
- newoff,
- newxmin);
-
- /*
- * A page-level lock on the page containing the old tuple counts too.
- * Anyone holding a lock on the page is logically holding a lock on the
- * old tuple, so we need to acquire a lock on his behalf on the new tuple
- * too. However, if the new tuple is on the same page as the old one, the
- * old page-level lock already covers the new tuple.
- *
- * A relation-level lock always covers both tuple versions, so we don't
- * need to worry about those here.
- */
- LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
-
- TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
- if (newblk != oldblk)
- TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
-
- LWLockRelease(SerializablePredicateLockListLock);
- }
-
/*
* DeleteLockTarget
--- 2249,2254 ----
***************
*** 2650,2658 **** PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
/*
* Bail out quickly if there are no serializable transactions
! * running. As with PredicateLockTupleRowVersionLink, it's safe to
! * check this without taking locks because the caller is holding
! * the buffer page lock.
*/
if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
return;
--- 2563,2577 ----
/*
* Bail out quickly if there are no serializable transactions
! * running.
! *
! * It's safe to do this check without taking any additional
! * locks. Even if a serializable transaction starts concurrently,
! * we know it can't take any SIREAD locks on the page being split
! * because the caller is holding the associated buffer page lock.
! * Memory reordering isn't an issue; the memory barrier in the
! * LWLock acquisition guarantees that this read occurs while the
! * buffer page lock is held.
*/
if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
return;
***************
*** 3890,3897 **** FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
}
/*
! * Check whether we should roll back one of these transactions
! * instead of flagging a new rw-conflict.
*/
static void
OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
--- 3809,3825 ----
}
/*
! * We are about to add a RW-edge to the dependency graph - check that we don't
! * introduce a dangerous structure by doing so, and abort one of the
! * transactions if so.
! *
! * A serialization failure can only occur if there is a dangerous structure
! * in the dependency graph:
! *
! * Tin ------> Tpivot ------> Tout
! * rw rw
! *
! * Furthermore, Tout must commit first.
*/
static void
OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
***************
*** 3905,3992 **** OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
failure = false;
/*
! * Check for already-committed writer with rw-conflict out flagged. This
! * means that the reader must immediately fail.
*/
if (SxactIsCommitted(writer)
&& (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
failure = true;
/*
! * Check whether the reader has become a pivot with a committed writer. If
! * so, we must roll back unless every in-conflict either committed before
! * the writer committed or is READ ONLY and overlaps the writer.
*/
! if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
{
! if (SxactHasSummaryConflictIn(reader))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
! SHMQueueNext(&reader->inConflicts,
! &reader->inConflicts,
! offsetof(RWConflictData, inLink));
while (conflict)
{
! if (!SxactIsRolledBack(conflict->sxactOut)
! && (!SxactIsCommitted(conflict->sxactOut)
! || conflict->sxactOut->commitSeqNo >= writer->commitSeqNo)
! && (!SxactIsReadOnly(conflict->sxactOut)
! || conflict->sxactOut->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
{
failure = true;
break;
}
conflict = (RWConflict)
! SHMQueueNext(&reader->inConflicts,
! &conflict->inLink,
! offsetof(RWConflictData, inLink));
}
}
/*
! * Check whether the writer has become a pivot with an out-conflict
! * committed transaction, while neither reader nor writer is committed. If
! * the reader is a READ ONLY transaction, there is only a serialization
! * failure if an out-conflict transaction causing the pivot committed
! * before the reader acquired its snapshot. (That is, the reader must not
! * have been concurrent with the out-conflict transaction.)
*/
! if (!failure && !SxactIsCommitted(writer))
{
! if (SxactHasSummaryConflictOut(reader))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
! SHMQueueNext(&writer->outConflicts,
! &writer->outConflicts,
! offsetof(RWConflictData, outLink));
while (conflict)
{
! if ((reader == conflict->sxactIn && SxactIsCommitted(reader))
! || (SxactIsCommitted(conflict->sxactIn)
! && !SxactIsCommitted(reader)
! && (!SxactIsReadOnly(reader)
! || conflict->sxactIn->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)))
{
failure = true;
break;
}
conflict = (RWConflict)
! SHMQueueNext(&writer->outConflicts,
! &conflict->outLink,
! offsetof(RWConflictData, outLink));
}
}
if (failure)
{
if (MySerializableXact == writer)
{
LWLockRelease(SerializableXactHashLock);
--- 3833,3958 ----
failure = false;
/*
! * Check for already-committed writer with rw-conflict out flagged:
! *
! * R ------> W ------> T2
! * rw rw
! *
! * That is a dangerous structure, so we must abort. (Since the writer
! * has already committed, we must be the reader)
! *
! * XXX: for an anomaly to occur, T2 must've committed before W. Couldn't
! * we easily check that here, or does the fact that W committed already
! * somehow imply it?
*/
if (SxactIsCommitted(writer)
&& (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
failure = true;
/*
! * Check whether the writer has become a pivot with an out-conflict
! * committed transaction (T2), and T2 committed first:
! *
! * R ------> W ------> T2
! * rw rw
! *
! * Because T2 must've committed first, there is no anomaly if:
! * - the reader committed before T2
! * - the writer committed before T2
! * - the reader is a READ ONLY transaction and the reader was not
! * concurrent with T2 (= reader acquired its snapshot after T2 committed)
! *
! * XXX: Where does that last condition come from?
*/
! if (!failure)
{
! if (SxactHasSummaryConflictOut(writer))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
! SHMQueueNext(&writer->outConflicts,
! &writer->outConflicts,
! offsetof(RWConflictData, outLink));
while (conflict)
{
! SERIALIZABLEXACT *t2 = conflict->sxactIn;
!
! if (SxactIsCommitted(t2)
! && (!SxactIsCommitted(reader)
! || t2->commitSeqNo <= reader->commitSeqNo)
! && (!SxactIsCommitted(writer)
! || t2->commitSeqNo <= writer->commitSeqNo)
! && (!SxactIsReadOnly(reader)
! || t2->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
{
failure = true;
break;
}
conflict = (RWConflict)
! SHMQueueNext(&writer->outConflicts,
! &conflict->outLink,
! offsetof(RWConflictData, outLink));
}
}
/*
! * Check whether the reader has become a pivot with a committed writer:
! *
! * T0 ------> R ------> W
! * rw rw
! *
! * Because W must've committed first for an anomaly to occur, there is no
! * anomaly if:
! * - T0 committed before the writer
! * - T0 is READ ONLY, and overlaps the writer
! *
! * XXX: again, where does that last condition come from?
*/
! if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
{
! if (SxactHasSummaryConflictIn(reader))
{
failure = true;
conflict = NULL;
}
else
conflict = (RWConflict)
! SHMQueueNext(&reader->inConflicts,
! &reader->inConflicts,
! offsetof(RWConflictData, inLink));
while (conflict)
{
! SERIALIZABLEXACT *t0 = conflict->sxactOut;
!
! if (!SxactIsRolledBack(t0)
! && (!SxactIsCommitted(t0)
! || t0->commitSeqNo >= writer->commitSeqNo)
! && (!SxactIsReadOnly(t0)
! || t0->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
{
failure = true;
break;
}
conflict = (RWConflict)
! SHMQueueNext(&reader->inConflicts,
! &conflict->inLink,
! offsetof(RWConflictData, inLink));
}
}
if (failure)
{
+ /*
+ * We have to kill a transaction to avoid a possible anomaly from
+ * occurring. If the writer is us, we can just ereport() to cause
+ * a transaction abort. Otherwise we flag the writer for termination,
+ * causing it to abort when it tries to commit. However, if the writer
+ * is a prepared transaction, already prepared, we can't abort it
+ * anymore, so we have to kill the reader instead.
+ */
if (MySerializableXact == writer)
{
LWLockRelease(SerializableXactHashLock);
***************
*** 3999,4004 **** OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
--- 3965,3973 ----
else if (SxactIsPrepared(writer))
{
LWLockRelease(SerializableXactHashLock);
+
+ /* if we're not the writer, we have to be the reader */
+ Assert(MySerializableXact == reader);
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to read/write dependencies among transactions"),
*** a/src/include/storage/predicate.h
--- b/src/include/storage/predicate.h
***************
*** 47,53 **** extern void RegisterPredicateLockingXid(const TransactionId xid);
extern void PredicateLockRelation(const Relation relation);
extern void PredicateLockPage(const Relation relation, const BlockNumber blkno);
extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple);
- extern void PredicateLockTupleRowVersionLink(const Relation relation, const HeapTuple oldTuple, const HeapTuple newTuple);
extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
extern void ReleasePredicateLocks(const bool isCommit);
--- 47,52 ----
*** a/src/test/isolation/expected/multiple-row-versions.out
--- b/src/test/isolation/expected/multiple-row-versions.out
***************
*** 19,24 **** id txt
1
step c4: COMMIT;
step c3: COMMIT;
- ERROR: could not serialize access due to read/write dependencies among transactions
step wz1: UPDATE t SET txt = 'a' WHERE id = 1;
step c1: COMMIT;
--- 19,24 ----
1
step c4: COMMIT;
step c3: COMMIT;
step wz1: UPDATE t SET txt = 'a' WHERE id = 1;
+ ERROR: could not serialize access due to read/write dependencies among transactions
step c1: COMMIT;
*** a/src/test/isolation/specs/multiple-row-versions.spec
--- b/src/test/isolation/specs/multiple-row-versions.spec
***************
*** 1,8 ****
# Multiple Row Versions test
#
! # This test is designed to ensure that predicate locks taken on one version
! # of a row are detected as conflicts when a later version of the row is
! # updated or deleted by a transaction concurrent to the reader.
#
# Due to long permutation setup time, we are only testing one specific
# permutation, which should get a serialization error.
--- 1,7 ----
# Multiple Row Versions test
#
! # This test is designed to cover some code paths which only occur with
! # four or more transactions interacting with particular timings.
#
# Due to long permutation setup time, we are only testing one specific
# permutation, which should get a serialization error.