wip-subquery-stats.patch

text/x-patch

Filename: wip-subquery-stats.patch
Type: text/x-patch
Part: 0
Message: Re: limit in subquery causes poor selectivity estimation
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index ef29374fcccae23cb663c04470f12c22321a0e2c..a703f4a910c0e5f521f09cf9564a05c73cf803b8 100644
*** a/src/backend/utils/adt/selfuncs.c
--- b/src/backend/utils/adt/selfuncs.c
***************
*** 95,100 ****
--- 95,101 ----
  #include "access/sysattr.h"
  #include "catalog/index.h"
  #include "catalog/pg_collation.h"
+ #include "catalog/pg_inherits_fn.h"
  #include "catalog/pg_opfamily.h"
  #include "catalog/pg_statistic.h"
  #include "catalog/pg_type.h"
*************** static double convert_one_bytea_to_scala
*** 168,173 ****
--- 169,176 ----
  							int rangelo, int rangehi);
  static char *convert_string_datum(Datum value, Oid typid);
  static double convert_timevalue_to_scalar(Datum value, Oid typid);
+ static void examine_variable_recurse(Query *subquery, AttrNumber resno,
+ 						 VariableStatData *vardata);
  static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
  				   Oid sortop, Datum *min, Datum *max);
  static bool get_actual_variable_range(PlannerInfo *root,
*************** examine_variable(PlannerInfo *root, Node
*** 4176,4196 ****
  		}
  		else if (rte->rtekind == RTE_RELATION)
  		{
  			vardata->statsTuple = SearchSysCache3(STATRELATTINH,
  												ObjectIdGetDatum(rte->relid),
  												Int16GetDatum(var->varattno),
  												  BoolGetDatum(rte->inh));
  			vardata->freefunc = ReleaseSysCache;
  		}
  		else
  		{
  			/*
! 			 * XXX This means the Var comes from a JOIN or sub-SELECT. Later
! 			 * add code to dig down into the join etc and see if we can trace
! 			 * the variable to something with stats.  (But beware of
! 			 * sub-SELECTs with DISTINCT/GROUP BY/etc.	Perhaps there are no
! 			 * cases where this would really be useful, because we'd have
! 			 * flattened the subselect if it is??)
  			 */
  		}
  
--- 4179,4205 ----
  		}
  		else if (rte->rtekind == RTE_RELATION)
  		{
+ 			/* plain table, so look up the column in pg_statistic */
  			vardata->statsTuple = SearchSysCache3(STATRELATTINH,
  												ObjectIdGetDatum(rte->relid),
  												Int16GetDatum(var->varattno),
  												  BoolGetDatum(rte->inh));
  			vardata->freefunc = ReleaseSysCache;
  		}
+ 		else if (rte->rtekind == RTE_SUBQUERY)
+ 		{
+ 			/* recurse into subquery */
+ 			examine_variable_recurse(rte->subquery, var->varattno,
+ 									 vardata);
+ 		}
  		else
  		{
  			/*
! 			 * Otherwise, the Var comes from a FUNCTION, VALUES, or CTE RTE.
! 			 * (We won't see RTE_JOIN here because join alias Vars have
! 			 * already been flattened.)  There's not much we can do with
! 			 * function outputs, but maybe someday try to be smarter about
! 			 * VALUES and/or CTEs.
  			 */
  		}
  
*************** examine_variable(PlannerInfo *root, Node
*** 4335,4340 ****
--- 4344,4435 ----
  }
  
  /*
+  * examine_variable_recurse
+  *		Handle a Var referring to a subquery result for examine_variable
+  *
+  * If the Var ultimately refers to a column of a table, we can fill in the
+  * statsTuple; none of the other fields of vardata need to be changed.
+  * (In particular, we don't try to set isunique, since even if the underlying
+  * column is unique, the subquery may have joined to other tables in a way
+  * that creates duplicates.)
+  *
+  * XXX For the moment, we ignore the possibility that the subquery may change
+  * the column's statistics so much that we'd be better off not consulting
+  * pg_statistic at all.  Possibly we should give up if the subquery uses
+  * GROUP BY or DISTINCT, for instance.
+  *
+  * The subquery we are looking at has not been through planner preprocessing,
+  * so some things have to be done differently here than in examine_variable.
+  * (Perhaps we should rearrange things so that the sub-root data structure
+  * is kept around after we plan a subquery?)
+  */
+ static void
+ examine_variable_recurse(Query *subquery, AttrNumber resno,
+ 						 VariableStatData *vardata)
+ {
+ 	TargetEntry *ste;
+ 	Var		   *var;
+ 	RangeTblEntry *rte;
+ 
+ 	/* Get the subquery output expression referenced by the upper Var */
+ 	ste = get_tle_by_resno(subquery->targetList, resno);
+ 	if (ste == NULL || ste->resjunk)
+ 		elog(ERROR, "subquery does not have attribute %d", resno);
+ 	var = (Var *) ste->expr;
+ 
+ restart:
+ 	/* Can only handle a simple Var of subquery's query level */
+ 	if (var && IsA(var, Var) &&
+ 		var->varlevelsup == 0)
+ 	{
+ 		/* OK, fetch the RTE referenced by the Var */
+ 		rte = rt_fetch(var->varno, subquery->rtable);
+ 
+ 		/* XXX Can't call get_relation_stats_hook for lack of subroot */
+ 
+ 		if (rte->rtekind == RTE_RELATION)
+ 		{
+ 			/*
+ 			 * We can't just rely on rte->inh here, because in parser output
+ 			 * that only signifies the absence of ONLY.  We have to check
+ 			 * relhassubclass as well.  For speed, we don't actually examine
+ 			 * pg_inherits, so we might get fooled by a table that once had
+ 			 * children but no longer does.  But that's okay, because ANALYZE
+ 			 * will still produce inheritance-tree statistics in such cases,
+ 			 * so we will still find an applicable pg_statistic entry.
+ 			 */
+ 			bool	inh;
+ 
+ 			inh = rte->inh && has_subclass(rte->relid);
+ 			vardata->statsTuple = SearchSysCache3(STATRELATTINH,
+ 												  ObjectIdGetDatum(rte->relid),
+ 												  Int16GetDatum(var->varattno),
+ 												  BoolGetDatum(inh));
+ 			vardata->freefunc = ReleaseSysCache;
+ 		}
+ 		else if (rte->rtekind == RTE_SUBQUERY)
+ 		{
+ 			/* recurse some more ... */
+ 			examine_variable_recurse(rte->subquery, var->varattno,
+ 									 vardata);
+ 		}
+ 		else if (rte->rtekind == RTE_JOIN)
+ 		{
+ 			/*
+ 			 * Since join alias variables haven't been flattened in the
+ 			 * subquery, we have to be prepared to dereference them.
+ 			 */
+ 			if (var->varattno == InvalidAttrNumber)
+ 				return;			/* punt on whole-row reference */
+ 			Assert(var->varattno > 0);
+ 			var = (Var *) list_nth(rte->joinaliasvars, var->varattno - 1);
+ 			goto restart;
+ 		}
+ 		/* As in examine_variable, do nothing for other RTE kinds */
+ 	}
+ }
+ 
+ /*
   * get_variable_numdistinct
   *	  Estimate the number of distinct values of a variable.
   *