ssi-tuple-not-row-locking-4.patch

text/x-diff
Filename: ssi-tuple-not-row-locking-4.patch
Type: text/x-diff
Part: 0
Message: Re: SSI predicate locking on heap -- tuple or row?
Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: context
File	+	−
src/backend/access/heap/heapam.c	2	0
src/backend/access/index/indexam.c	1	0
src/backend/storage/lmgr/predicate.c	98	3
src/backend/storage/lmgr/README-SSI	48	0
src/include/storage/predicate.h	0	0
src/test/isolation/expected/multiple-row-versions.out	1	0
src/test/isolation/specs/multiple-row-versions.spec	2	3
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 1529,1535 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
  	OffsetNumber offnum;
  	bool		at_chain_start;
  	bool		valid;
- 	bool		match_found;
  
  	if (all_dead)
  		*all_dead = true;
--- 1529,1534 ----
***************
*** 1539,1545 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
  	Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
  	offnum = ItemPointerGetOffsetNumber(tid);
  	at_chain_start = true;
- 	match_found = false;
  
  	/* Scan through possible multiple members of HOT-chain */
  	for (;;)
--- 1538,1543 ----
***************
*** 1597,1606 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
  			PredicateLockTuple(relation, &heapTuple);
  			if (all_dead)
  				*all_dead = false;
! 			if (IsolationIsSerializable())
! 				match_found = true;
! 			else
! 				return true;
  		}
  
  		/*
--- 1595,1601 ----
  			PredicateLockTuple(relation, &heapTuple);
  			if (all_dead)
  				*all_dead = false;
! 			return true;
  		}
  
  		/*
***************
*** 1629,1635 **** heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
  			break;				/* end of chain */
  	}
  
! 	return match_found;
  }
  
  /*
--- 1624,1630 ----
  			break;				/* end of chain */
  	}
  
! 	return false;
  }
  
  /*
***************
*** 2855,2866 **** l2:
  
  	END_CRIT_SECTION();
  
- 	/*
- 	 * Any existing SIREAD locks on the old tuple must be linked to the new
- 	 * tuple for conflict detection purposes.
- 	 */
- 	PredicateLockTupleRowVersionLink(relation, &oldtup, heaptup);
- 
  	if (newbuf != buffer)
  		LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 2850,2855 ----
*** a/src/backend/access/index/indexam.c
--- b/src/backend/access/index/indexam.c
***************
*** 612,619 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
  				 * any more members.  Otherwise, check for continuation of the
  				 * HOT-chain, and set state for next time.
  				 */
! 				if (IsMVCCSnapshot(scan->xs_snapshot)
! 					&& !IsolationIsSerializable())
  					scan->xs_next_hot = InvalidOffsetNumber;
  				else if (HeapTupleIsHotUpdated(heapTuple))
  				{
--- 612,618 ----
  				 * any more members.  Otherwise, check for continuation of the
  				 * HOT-chain, and set state for next time.
  				 */
! 				if (IsMVCCSnapshot(scan->xs_snapshot))
  					scan->xs_next_hot = InvalidOffsetNumber;
  				else if (HeapTupleIsHotUpdated(heapTuple))
  				{
*** a/src/backend/storage/lmgr/README-SSI
--- b/src/backend/storage/lmgr/README-SSI
***************
*** 402,407 **** is based on the top level xid.  When looking at an xid that comes
--- 402,455 ----
  from a tuple's xmin or xmax, for example, we always call
  SubTransGetTopmostTransaction() before doing much else with it.
  
+     * PostgreSQL does not use "update in place" with a rollback log
+ for its MVCC implementation.  Where possible it uses "HOT" updates on
+ the same page (if there is room and no indexed value is changed).
+ For non-HOT updates the old tuple is expired in place and a new tuple
+ is inserted at a new location.  Because of this difference, a tuple
+ lock in PostgreSQL doesn't automatically lock any other versions of a
+ row.  We don't try to copy or expand a tuple lock to any other
+ versions of the row, based on the following proof that any additional
+ serialization failures we would get from that would be false
+ positives:
+ 
+           o If transaction T1 reads a row (thus acquiring a predicate
+ lock on it) and a second transaction T2 updates that row, must a
+ third transaction T3 which updates the new version of the row have a
+ rw-conflict in from T1 to prevent anomalies?  In other words, does it
+ matter whether this edge T1 -> T3 is there?
+ 
+           o If T1 has a conflict in, it certainly doesn't. Adding the
+ edge T1 -> T3 would create a dangerous structure, but we already had
+ one from the edge T1 -> T2, so we would have aborted something
+ anyway.
+ 
+           o Now let's consider the case where T1 doesn't have a
+ conflict in. If that's the case, for this edge T1 -> T3 to make a
+ difference, T3 must have a rw-conflict out that induces a cycle in
+ the dependency graph, i.e. a conflict out to some transaction
+ preceding T1 in the serial order. (A conflict out to T1 would work
+ too, but that would mean T1 has a conflict in and we would have
+ rolled back.)
+ 
+           o So now we're trying to figure out if there can be an
+ rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+ T1. For T0 to precede T1, there has to be has to be some edge, or
+ sequence of edges, from T0 to T1. At least the last edge has to be a
+ wr-dependency or ww-dependency rather than a rw-conflict, because T1
+ doesn't have a rw-conflict in. And that gives us enough information
+ about the order of transactions to see that T3 can't have a
+ rw-dependency to T0:
+  - T0 committed before T1 started (the wr/ww-dependency implies this)
+  - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+  - T2 committed before T3 started (otherwise, T3 would be aborted
+                                    because of an update conflict)
+ 
+           o That means T0 committed before T3 started, and therefore
+ there can't be a rw-conflict from T3 to T0.
+ 
+           o In both cases, we didn't need the T1 -> T3 edge.
+ 
      * Predicate locking in PostgreSQL will start at the tuple level
  when possible, with automatic conversion of multiple fine-grained
  locks to coarser granularity as need to avoid resource exhaustion.
*** a/src/backend/storage/lmgr/predicate.c
--- b/src/backend/storage/lmgr/predicate.c
***************
*** 155,163 ****
   *							   BlockNumber newblkno);
   *		PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
   *								 BlockNumber newblkno);
-  *		PredicateLockTupleRowVersionLink(const Relation relation,
-  *										 const HeapTuple oldTuple,
-  *										 const HeapTuple newTuple)
   *		ReleasePredicateLocks(bool isCommit)
   *
   * conflict detection (may also trigger rollback)
--- 155,160 ----
***************
*** 2252,2341 **** PredicateLockTuple(const Relation relation, const HeapTuple tuple)
  	PredicateLockAcquire(&tag);
  }
  
- /*
-  * If the old tuple has any predicate locks, copy them to the new target.
-  *
-  * This is called at an UPDATE, where any predicate locks held on the old
-  * tuple need to be copied to the new tuple, because logically they both
-  * represent the same row. A lock taken before the update must conflict
-  * with anyone locking the same row after the update.
-  */
- void
- PredicateLockTupleRowVersionLink(const Relation relation,
- 								 const HeapTuple oldTuple,
- 								 const HeapTuple newTuple)
- {
- 	PREDICATELOCKTARGETTAG oldtupletag;
- 	PREDICATELOCKTARGETTAG oldpagetag;
- 	PREDICATELOCKTARGETTAG newtupletag;
- 	BlockNumber oldblk,
- 				newblk;
- 	OffsetNumber oldoff,
- 				newoff;
- 	TransactionId oldxmin,
- 				newxmin;
- 
- 	/*
- 	 * Bail out quickly if there are no serializable transactions
- 	 * running.
- 	 *
- 	 * It's safe to do this check without taking any additional
- 	 * locks. Even if a serializable transaction starts concurrently,
- 	 * we know it can't take any SIREAD locks on the modified tuple
- 	 * because the caller is holding the associated buffer page lock.
- 	 * Memory reordering isn't an issue; the memory barrier in the
- 	 * LWLock acquisition guarantees that this read occurs while the
- 	 * buffer page lock is held.
- 	 */
- 	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
- 		return;
- 
- 	oldblk = ItemPointerGetBlockNumber(&(oldTuple->t_self));
- 	oldoff = ItemPointerGetOffsetNumber(&(oldTuple->t_self));
- 	oldxmin = HeapTupleHeaderGetXmin(oldTuple->t_data);
- 
- 	newblk = ItemPointerGetBlockNumber(&(newTuple->t_self));
- 	newoff = ItemPointerGetOffsetNumber(&(newTuple->t_self));
- 	newxmin = HeapTupleHeaderGetXmin(newTuple->t_data);
- 
- 	SET_PREDICATELOCKTARGETTAG_TUPLE(oldtupletag,
- 									 relation->rd_node.dbNode,
- 									 relation->rd_id,
- 									 oldblk,
- 									 oldoff,
- 									 oldxmin);
- 
- 	SET_PREDICATELOCKTARGETTAG_PAGE(oldpagetag,
- 									relation->rd_node.dbNode,
- 									relation->rd_id,
- 									oldblk);
- 
- 	SET_PREDICATELOCKTARGETTAG_TUPLE(newtupletag,
- 									 relation->rd_node.dbNode,
- 									 relation->rd_id,
- 									 newblk,
- 									 newoff,
- 									 newxmin);
- 
- 	/*
- 	 * A page-level lock on the page containing the old tuple counts too.
- 	 * Anyone holding a lock on the page is logically holding a lock on the
- 	 * old tuple, so we need to acquire a lock on his behalf on the new tuple
- 	 * too. However, if the new tuple is on the same page as the old one, the
- 	 * old page-level lock already covers the new tuple.
- 	 *
- 	 * A relation-level lock always covers both tuple versions, so we don't
- 	 * need to worry about those here.
- 	 */
- 	LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
- 
- 	TransferPredicateLocksToNewTarget(oldtupletag, newtupletag, false);
- 	if (newblk != oldblk)
- 		TransferPredicateLocksToNewTarget(oldpagetag, newtupletag, false);
- 
- 	LWLockRelease(SerializablePredicateLockListLock);
- }
- 
  
  /*
   *		DeleteLockTarget
--- 2249,2254 ----
***************
*** 2650,2658 **** PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno,
  
  	/*
  	 * Bail out quickly if there are no serializable transactions
! 	 * running. As with PredicateLockTupleRowVersionLink, it's safe to
! 	 * check this without taking locks because the caller is holding
! 	 * the buffer page lock.
  	 */
  	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
  		return;
--- 2563,2577 ----
  
  	/*
  	 * Bail out quickly if there are no serializable transactions
! 	 * running.
! 	 *
! 	 * It's safe to do this check without taking any additional
! 	 * locks. Even if a serializable transaction starts concurrently,
! 	 * we know it can't take any SIREAD locks on the page being split
! 	 * because the caller is holding the associated buffer page lock.
! 	 * Memory reordering isn't an issue; the memory barrier in the
! 	 * LWLock acquisition guarantees that this read occurs while the
! 	 * buffer page lock is held.
  	 */
  	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
  		return;
***************
*** 3890,3897 **** FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
  }
  
  /*
!  * Check whether we should roll back one of these transactions
!  * instead of flagging a new rw-conflict.
   */
  static void
  OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
--- 3809,3825 ----
  }
  
  /*
!  * We are about to add a RW-edge to the dependency graph - check that we don't
!  * introduce a dangerous structure by doing so, and abort one of the
!  * transactions if so.
!  *
!  * A serialization failure can only occur if there is a dangerous structure
!  * in the dependency graph:
!  *
!  *		Tin ------> Tpivot ------> Tout
!  *	   		  rw			 rw
!  *
!  * Furthermore, Tout must commit first.
   */
  static void
  OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
***************
*** 3905,3992 **** OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
  	failure = false;
  
  	/*
! 	 * Check for already-committed writer with rw-conflict out flagged. This
! 	 * means that the reader must immediately fail.
  	 */
  	if (SxactIsCommitted(writer)
  	  && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
  		failure = true;
  
  	/*
! 	 * Check whether the reader has become a pivot with a committed writer. If
! 	 * so, we must roll back unless every in-conflict either committed before
! 	 * the writer committed or is READ ONLY and overlaps the writer.
  	 */
! 	if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
  	{
! 		if (SxactHasSummaryConflictIn(reader))
  		{
  			failure = true;
  			conflict = NULL;
  		}
  		else
  			conflict = (RWConflict)
! 				SHMQueueNext(&reader->inConflicts,
! 							 &reader->inConflicts,
! 							 offsetof(RWConflictData, inLink));
  		while (conflict)
  		{
! 			if (!SxactIsRolledBack(conflict->sxactOut)
! 				&& (!SxactIsCommitted(conflict->sxactOut)
! 					|| conflict->sxactOut->commitSeqNo >= writer->commitSeqNo)
! 				&& (!SxactIsReadOnly(conflict->sxactOut)
! 					|| conflict->sxactOut->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
  			{
  				failure = true;
  				break;
  			}
  			conflict = (RWConflict)
! 				SHMQueueNext(&reader->inConflicts,
! 							 &conflict->inLink,
! 							 offsetof(RWConflictData, inLink));
  		}
  	}
  
  	/*
! 	 * Check whether the writer has become a pivot with an out-conflict
! 	 * committed transaction, while neither reader nor writer is committed. If
! 	 * the reader is a READ ONLY transaction, there is only a serialization
! 	 * failure if an out-conflict transaction causing the pivot committed
! 	 * before the reader acquired its snapshot.  (That is, the reader must not
! 	 * have been concurrent with the out-conflict transaction.)
  	 */
! 	if (!failure && !SxactIsCommitted(writer))
  	{
! 		if (SxactHasSummaryConflictOut(reader))
  		{
  			failure = true;
  			conflict = NULL;
  		}
  		else
  			conflict = (RWConflict)
! 				SHMQueueNext(&writer->outConflicts,
! 							 &writer->outConflicts,
! 							 offsetof(RWConflictData, outLink));
  		while (conflict)
  		{
! 			if ((reader == conflict->sxactIn && SxactIsCommitted(reader))
! 				|| (SxactIsCommitted(conflict->sxactIn)
! 					&& !SxactIsCommitted(reader)
! 					&& (!SxactIsReadOnly(reader)
! 						|| conflict->sxactIn->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)))
  			{
  				failure = true;
  				break;
  			}
  			conflict = (RWConflict)
! 				SHMQueueNext(&writer->outConflicts,
! 							 &conflict->outLink,
! 							 offsetof(RWConflictData, outLink));
  		}
  	}
  
  	if (failure)
  	{
  		if (MySerializableXact == writer)
  		{
  			LWLockRelease(SerializableXactHashLock);
--- 3833,3958 ----
  	failure = false;
  
  	/*
! 	 * Check for already-committed writer with rw-conflict out flagged:
! 	 *
! 	 *		R ------> W ------> T2
! 	 *			rw        rw
! 	 *
! 	 * That is a dangerous structure, so we must abort. (Since the writer
! 	 * has already committed, we must be the reader)
! 	 *
! 	 * XXX: for an anomaly to occur, T2 must've committed before W. Couldn't
! 	 * we easily check that here, or does the fact that W committed already
! 	 * somehow imply it?
  	 */
  	if (SxactIsCommitted(writer)
  	  && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
  		failure = true;
  
  	/*
! 	 * Check whether the writer has become a pivot with an out-conflict
! 	 * committed transaction (T2), and T2 committed first:
! 	 *
! 	 *		R ------> W ------> T2
! 	 *			rw        rw
! 	 *
! 	 * Because T2 must've committed first, there is no anomaly if:
! 	 * - the reader committed before T2
! 	 * - the writer committed before T2
! 	 * - the reader is a READ ONLY transaction and the reader was not
! 	 *   concurrent with T2 (= reader acquired its snapshot after T2 committed)
! 	 *
! 	 * XXX: Where does that last condition come from?
  	 */
! 	if (!failure)
  	{
! 		if (SxactHasSummaryConflictOut(writer))
  		{
  			failure = true;
  			conflict = NULL;
  		}
  		else
  			conflict = (RWConflict)
! 				SHMQueueNext(&writer->outConflicts,
! 							 &writer->outConflicts,
! 							 offsetof(RWConflictData, outLink));
  		while (conflict)
  		{
! 			SERIALIZABLEXACT *t2 = conflict->sxactIn;
! 
! 			if (SxactIsCommitted(t2)
! 				&& (!SxactIsCommitted(reader)
! 					|| t2->commitSeqNo <= reader->commitSeqNo)
! 				&& (!SxactIsCommitted(writer)
! 					|| t2->commitSeqNo <= writer->commitSeqNo)
! 				&& (!SxactIsReadOnly(reader)
! 					|| t2->commitSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
  			{
  				failure = true;
  				break;
  			}
  			conflict = (RWConflict)
! 				SHMQueueNext(&writer->outConflicts,
! 							 &conflict->outLink,
! 							 offsetof(RWConflictData, outLink));
  		}
  	}
  
  	/*
! 	 * Check whether the reader has become a pivot with a committed writer:
! 	 *
! 	 *		T0 ------> R ------> W
! 	 *			 rw        rw
! 	 *
! 	 * Because W must've committed first for an anomaly to occur, there is no
! 	 * anomaly if:
! 	 * - T0 committed before the writer
! 	 * - T0 is READ ONLY, and overlaps the writer
! 	 *
! 	 * XXX: again, where does that last condition come from?
  	 */
! 	if (!failure && SxactIsCommitted(writer) && !SxactIsReadOnly(reader))
  	{
! 		if (SxactHasSummaryConflictIn(reader))
  		{
  			failure = true;
  			conflict = NULL;
  		}
  		else
  			conflict = (RWConflict)
! 				SHMQueueNext(&reader->inConflicts,
! 							 &reader->inConflicts,
! 							 offsetof(RWConflictData, inLink));
  		while (conflict)
  		{
! 			SERIALIZABLEXACT *t0 = conflict->sxactOut;
! 
! 			if (!SxactIsRolledBack(t0)
! 				&& (!SxactIsCommitted(t0)
! 					|| t0->commitSeqNo >= writer->commitSeqNo)
! 				&& (!SxactIsReadOnly(t0)
! 					|| t0->SeqNo.lastCommitBeforeSnapshot >= writer->commitSeqNo))
  			{
  				failure = true;
  				break;
  			}
  			conflict = (RWConflict)
! 				SHMQueueNext(&reader->inConflicts,
! 							 &conflict->inLink,
! 							 offsetof(RWConflictData, inLink));
  		}
  	}
  
  	if (failure)
  	{
+ 		/*
+ 		 * We have to kill a transaction to avoid a possible anomaly from
+ 		 * occurring. If the writer is us, we can just ereport() to cause
+ 		 * a transaction abort. Otherwise we flag the writer for termination,
+ 		 * causing it to abort when it tries to commit. However, if the writer
+ 		 * is a prepared transaction, already prepared, we can't abort it
+ 		 * anymore, so we have to kill the reader instead.
+ 		 */
  		if (MySerializableXact == writer)
  		{
  			LWLockRelease(SerializableXactHashLock);
***************
*** 3999,4004 **** OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
--- 3965,3973 ----
  		else if (SxactIsPrepared(writer))
  		{
  			LWLockRelease(SerializableXactHashLock);
+ 
+ 			/* if we're not the writer, we have to be the reader */
+ 			Assert(MySerializableXact == reader);
  			ereport(ERROR,
  					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
  					 errmsg("could not serialize access due to read/write dependencies among transactions"),
*** a/src/include/storage/predicate.h
--- b/src/include/storage/predicate.h
***************
*** 47,53 **** extern void RegisterPredicateLockingXid(const TransactionId xid);
  extern void PredicateLockRelation(const Relation relation);
  extern void PredicateLockPage(const Relation relation, const BlockNumber blkno);
  extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple);
- extern void PredicateLockTupleRowVersionLink(const Relation relation, const HeapTuple oldTuple, const HeapTuple newTuple);
  extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
  extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
  extern void ReleasePredicateLocks(const bool isCommit);
--- 47,52 ----
*** a/src/test/isolation/expected/multiple-row-versions.out
--- b/src/test/isolation/expected/multiple-row-versions.out
***************
*** 19,24 **** id             txt
  1                             
  step c4:  COMMIT; 
  step c3:  COMMIT; 
- ERROR:  could not serialize access due to read/write dependencies among transactions
  step wz1:  UPDATE t SET txt = 'a' WHERE id = 1; 
  step c1:  COMMIT; 
--- 19,24 ----
  1                             
  step c4:  COMMIT; 
  step c3:  COMMIT; 
  step wz1:  UPDATE t SET txt = 'a' WHERE id = 1; 
+ ERROR:  could not serialize access due to read/write dependencies among transactions
  step c1:  COMMIT; 
*** a/src/test/isolation/specs/multiple-row-versions.spec
--- b/src/test/isolation/specs/multiple-row-versions.spec
***************
*** 1,8 ****
  # Multiple Row Versions test
  #
! # This test is designed to ensure that predicate locks taken on one version
! # of a row are detected as conflicts when a later version of the row is
! # updated or deleted by a transaction concurrent to the reader.
  #
  # Due to long permutation setup time, we are only testing one specific
  # permutation, which should get a serialization error.
--- 1,7 ----
  # Multiple Row Versions test
  #
! # This test is designed to cover some code paths which only occur with
! # four or more transactions interacting with particular timings.
  #
  # Due to long permutation setup time, we are only testing one specific
  # permutation, which should get a serialization error.