fast_makesign_v2.patch

text/x-patch
Filename: fast_makesign_v2.patch
Type: text/x-patch
Part: 0
Message: Re: Optimizing pg_trgm makesign() (was Re: WIP: Fast GiST index build)
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index b328a09f41fee50beb96a28835e15ef835222cd6..7cea6d68fdcde71e0fb033682d77c21978c3406b 100644
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
*************** gtrgm_out(PG_FUNCTION_ARGS)
*** 84,100 ****
  static void
  makesign(BITVECP sign, TRGM *a)
  {
! 	int4		k,
! 				len = ARRNELEM(a);
  	trgm	   *ptr = GETARR(a);
! 	int4		tmp = 0;
  
  	MemSet((void *) sign, 0, sizeof(BITVEC));
  	SETBIT(sign, SIGLENBIT);	/* set last unused bit */
! 	for (k = 0; k < len; k++)
  	{
! 		CPTRGM(((char *) &tmp), ptr + k);
! 		HASH(sign, tmp);
  	}
  }
  
--- 84,178 ----
  static void
  makesign(BITVECP sign, TRGM *a)
  {
! 	int4		len = ARRNELEM(a);	/* number of trgm elements in a */
  	trgm	   *ptr = GETARR(a);
! 	char	   *p;				/* byte cursor for the tail loop */
! 	char	   *endptr;			/* one past the last trigram */
! 	uint32		w1,				/* raw words fetched from the array */
! 				w2,
! 				w3;
! 	uint32		trg0 = 0,		/* tail-loop scratch, pre-zeroed */
! 				trg1,			/* trg1..trg4: trigrams cut out of w1..w3 */
! 				trg2,
! 				trg3,
! 				trg4;
! 	uint32	   *p32;			/* word cursor for the fast path */
  
  	MemSet((void *) sign, 0, sizeof(BITVEC));
  	SETBIT(sign, SIGLENBIT);	/* set last unused bit */
! 
! 	if (len <= 0)
! 		return;					/* nothing to hash */
! 
! 	/*----------
! 	 * Hash every trigram of 'a' into the signature 'sign'.  Each trigram
! 	 * must be loaded into a uint32 before HASH can be applied.  That would
! 	 * be easy if the trigrams were aligned on 4-byte boundaries, but they
! 	 * are not.  Copying each trigram byte-by-byte is simple but slow, and
! 	 * this function is a hotspot in penalty calculations.
! 	 *
! 	 * The first trigram in the array doesn't begin at a 4-byte boundary, as
! 	 * the flags byte comes first; but the next one does.  So we fetch the
! 	 * first trigram as a special case, and after that each four trigrams fall
! 	 * onto 4-byte words like this:
! 	 *
! 	 *  w1   w2   w3
! 	 * AAAB BBCC CDDD
! 	 *
! 	 * As long as there are at least four trigrams left to process, we fetch
! 	 * the next three words and extract the trigrams from them with bit
! 	 * operations, per the above diagram.  The last few trigrams are handled
! 	 * one at a time with byte-by-byte fetching.
! 	 *
! 	 * Note that this code yields different results on big-endian and
! 	 * little-endian machines, because the bytes of each trigram are loaded
! 	 * into a uint32 in memory order and left-justified.  That's probably
! 	 * undesirable, but changing this behavior would break existing indexes.
! 	 *----------
! 	 */
! 	endptr = (char *) (ptr + len);
! 	p32 = (uint32 *) (((char *) ptr) - 1);	/* step back over the flags byte */
! 	/* NOTE(review): assumes ptr - 1 is uint32-aligned -- confirm */
! 	/* First word holds the flags byte plus the first trigram */
! 	w1 = *(p32++);
! #ifdef WORDS_BIGENDIAN
! 	trg1 = w1 << 8;				/* shift the flags byte out */
! #else
! 	trg1 = w1 >> 8;				/* shift the flags byte out */
! #endif
! 	HASH(sign, trg1);
! 
! 	while ((char *) p32 <= endptr - 3 * sizeof(uint32))
  	{
! 		w1 = *(p32++);			/* AAAB */
! 		w2 = *(p32++);			/* BBCC */
! 		w3 = *(p32++);			/* CDDD */
! 
! #ifdef WORDS_BIGENDIAN
! 		trg1 = w1 & 0xFFFFFF00;	/* AAA */
! 		trg2 = (w1 << 24) | ((w2 & 0xFFFF0000) >> 8);	/* B + BB */
! 		trg3 = ((w2 & 0x0000FFFF) << 16) | ((w3 & 0xFF000000) >> 16);	/* CC + C */
! 		trg4 = w3 << 8;			/* DDD */
! #else
! 		trg1 = w1 & 0x00FFFFFF;	/* AAA */
! 		trg2 = (w1 >> 24) | ((w2 & 0x0000FFFF) << 8);	/* B + BB */
! 		trg3 = ((w2 & 0xFFFF0000) >> 16) | ((w3 & 0x000000FF) << 16);	/* CC + C */
! 		trg4 = w3 >> 8;			/* DDD */
! #endif
! 
! 		HASH(sign, trg1);
! 		HASH(sign, trg2);
! 		HASH(sign, trg3);
! 		HASH(sign, trg4);
! 	}
! 
! 	/* Handle the remaining 0-3 trigrams the slow way, one copy at a time */
! 	p = (char *) p32;
! 	while (p < endptr)
! 	{
! 		CPTRGM(((char *) &trg0), p);
! 		HASH(sign, trg0);
! 		p += 3;					/* advance to the next trigram */
  	}
  }