From a57d312a7706321d850faa048a562a0c0c01b835 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 22 Jul 2020 19:19:44 -0400
Subject: [PATCH 01/54] Support infinity and -infinity in the numeric data
 type.

Add infinities that behave the same as they do in the floating-point
data types.  Aside from any intrinsic usefulness these may have,
this closes an important gap in our ability to convert floating
values to numeric and/or replace float-based APIs with numeric.

The new values are represented by bit patterns that were formerly
not used (although old code probably would take them for NaNs).
So there shouldn't be any pg_upgrade hazard.

Patch by me, reviewed by Dean Rasheed and Andrew Gierth

Discussion: https://postgr.es/m/606717.1591924582@sss.pgh.pa.us
---
 contrib/jsonb_plperl/jsonb_plperl.c      |    6 +-
 contrib/jsonb_plpython/jsonb_plpython.c  |    9 +-
 doc/src/sgml/datatype.sgml               |   75 +-
 src/backend/utils/adt/formatting.c       |    9 +-
 src/backend/utils/adt/numeric.c          | 1474 +++++++++++++++++-----
 src/include/utils/numeric.h              |    1 +
 src/test/regress/expected/aggregates.out |   86 +-
 src/test/regress/expected/numeric.out    |  680 +++++++++-
 src/test/regress/expected/window.out     |   66 +-
 src/test/regress/sql/aggregates.sql      |   22 +-
 src/test/regress/sql/numeric.sql         |  187 ++-
 src/test/regress/sql/window.sql          |   16 +-
 12 files changed, 2251 insertions(+), 380 deletions(-)
diff --git a/contrib/jsonb_plperl/jsonb_plperl.c b/contrib/jsonb_plperl/jsonb_plperl.c
index ed361efbe2023..b81ba54b809dc 100644
--- a/contrib/jsonb_plperl/jsonb_plperl.c
+++ b/contrib/jsonb_plperl/jsonb_plperl.c
@@ -227,10 +227,8 @@ SV_to_JsonbValue(SV *in, JsonbParseState **jsonb_state, bool is_elem)
 				/*
 				 * jsonb doesn't allow infinity or NaN (per JSON
 				 * specification), but the numeric type that is used for the
-				 * storage accepts NaN, so we have to prevent it here
-				 * explicitly.  We don't really have to check for isinf()
-				 * here, as numeric doesn't allow it and it would be caught
-				 * later, but it makes for a nicer error message.
+				 * storage accepts those, so we have to reject them here
+				 * explicitly.
 				 */
 				if (isinf(nval))
 					ereport(ERROR,
diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c
index e09308daf07fa..836c17877065c 100644
--- a/contrib/jsonb_plpython/jsonb_plpython.c
+++ b/contrib/jsonb_plpython/jsonb_plpython.c
@@ -387,14 +387,17 @@ PLyNumber_ToJsonbValue(PyObject *obj, JsonbValue *jbvNum)
 	pfree(str);
 
 	/*
-	 * jsonb doesn't allow NaN (per JSON specification), so we have to prevent
-	 * it here explicitly.  (Infinity is also not allowed in jsonb, but
-	 * numeric_in above already catches that.)
+	 * jsonb doesn't allow NaN or infinity (per JSON specification), so we
+	 * have to reject those here explicitly.
 	 */
 	if (numeric_is_nan(num))
 		ereport(ERROR,
 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
 				 errmsg("cannot convert NaN to jsonb")));
+	if (numeric_is_inf(num))
+		ereport(ERROR,
+				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+				 errmsg("cannot convert infinity to jsonb")));
 
 	jbvNum->type = jbvNumeric;
 	jbvNum->val.numeric = num;
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 7027758d28dd7..50e370cae4405 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -554,9 +554,9 @@ NUMERIC(<replaceable>precision</replaceable>)
 <programlisting>
 NUMERIC
 </programlisting>
-     without any precision or scale creates a column in which numeric
-     values of any precision and scale can be stored, up to the
-     implementation limit on precision.  A column of this kind will
+     without any precision or scale creates an <quote>unconstrained
+     numeric</quote> column in which numeric values of any length can be
+     stored, up to the implementation limits.  A column of this kind will
      not coerce input values to any particular scale, whereas
      <type>numeric</type> columns with a declared scale will coerce
      input values to that scale.  (The <acronym>SQL</acronym> standard
@@ -568,10 +568,10 @@ NUMERIC
 
     <note>
      <para>
-      The maximum allowed precision when explicitly specified in the
-      type declaration is 1000; <type>NUMERIC</type> without a specified
-      precision is subject to the limits described in <xref
-      linkend="datatype-numeric-table"/>.
+      The maximum precision that can be explicitly specified in
+      a <type>NUMERIC</type> type declaration is 1000.  An
+      unconstrained <type>NUMERIC</type> column is subject to the limits
+      described in <xref linkend="datatype-numeric-table"/>.
      </para>
     </note>
 
@@ -593,6 +593,11 @@ NUMERIC
      plus three to eight bytes overhead.
     </para>
 
+    <indexterm>
+     <primary>infinity</primary>
+     <secondary>numeric (data type)</secondary>
+    </indexterm>
+
     <indexterm>
      <primary>NaN</primary>
      <see>not a number</see>
@@ -604,13 +609,44 @@ NUMERIC
     </indexterm>
 
     <para>
-     In addition to ordinary numeric values, the <type>numeric</type>
-     type allows the special value <literal>NaN</literal>, meaning
-     <quote>not-a-number</quote>.  Any operation on <literal>NaN</literal>
-     yields another <literal>NaN</literal>.  When writing this value
-     as a constant in an SQL command, you must put quotes around it,
-     for example <literal>UPDATE table SET x = 'NaN'</literal>.  On input,
-     the string <literal>NaN</literal> is recognized in a case-insensitive manner.
+     In addition to ordinary numeric values, the <type>numeric</type> type
+     has several special values:
+<literallayout>
+<literal>Infinity</literal>
+<literal>-Infinity</literal>
+<literal>NaN</literal>
+</literallayout>
+     These are adapted from the IEEE 754 standard, and represent
+     <quote>infinity</quote>, <quote>negative infinity</quote>, and
+     <quote>not-a-number</quote>, respectively. When writing these values
+     as constants in an SQL command, you must put quotes around them,
+     for example <literal>UPDATE table SET x = '-Infinity'</literal>.
+     On input, these strings are recognized in a case-insensitive manner.
+     The infinity values can alternatively be spelled <literal>inf</literal>
+     and <literal>-inf</literal>.
+    </para>
+
+    <para>
+     The infinity values behave as per mathematical expectations.  For
+     example, <literal>Infinity</literal> plus any finite value equals
+     <literal>Infinity</literal>, as does <literal>Infinity</literal>
+     plus <literal>Infinity</literal>; but <literal>Infinity</literal>
+     minus <literal>Infinity</literal> yields <literal>NaN</literal> (not a
+     number), because it has no well-defined interpretation.  Note that an
+     infinity can only be stored in an unconstrained <type>numeric</type>
+     column, because it notionally exceeds any finite precision limit.
+    </para>
+
+    <para>
+     The <literal>NaN</literal> (not a number) value is used to represent
+     undefined calculational results.  In general, any operation with
+     a <literal>NaN</literal> input yields another <literal>NaN</literal>.
+     The only exception is when the operation's other inputs are such that
+     the same output would be obtained if the <literal>NaN</literal> were to
+     be replaced by any finite or infinite numeric value; then, that output
+     value is used for <literal>NaN</literal> too.  (An example of this
+     principle is that <literal>NaN</literal> raised to the zero power
+     yields one.)
     </para>
 
     <note>
@@ -781,9 +817,14 @@ FROM generate_series(-3.5, 3.5, 1) as x;
      </para>
     </note>
 
+    <indexterm>
+     <primary>infinity</primary>
+     <secondary>floating point</secondary>
+    </indexterm>
+
     <indexterm>
      <primary>not a number</primary>
-     <secondary>double precision</secondary>
+     <secondary>floating point</secondary>
     </indexterm>
 
     <para>
@@ -800,11 +841,13 @@ FROM generate_series(-3.5, 3.5, 1) as x;
      as constants in an SQL command, you must put quotes around them,
      for example <literal>UPDATE table SET x = '-Infinity'</literal>.  On input,
      these strings are recognized in a case-insensitive manner.
+     The infinity values can alternatively be spelled <literal>inf</literal>
+     and <literal>-inf</literal>.
     </para>
 
     <note>
      <para>
-      IEEE754 specifies that <literal>NaN</literal> should not compare equal
+      IEEE 754 specifies that <literal>NaN</literal> should not compare equal
       to any other floating-point value (including <literal>NaN</literal>).
       In order to allow floating-point values to be sorted and used
       in tree-based indexes, <productname>PostgreSQL</productname> treats
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 16768b28c30e9..662643813660d 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -6129,9 +6129,12 @@ numeric_to_char(PG_FUNCTION_ARGS)
 		/*
 		 * numeric_out_sci() does not emit a sign for positive numbers.  We
 		 * need to add a space in this case so that positive and negative
-		 * numbers are aligned.  We also have to do the right thing for NaN.
+		 * numbers are aligned.  Also must check for NaN/infinity cases, which
+		 * we handle the same way as in float8_to_char.
 		 */
-		if (strcmp(orgnum, "NaN") == 0)
+		if (strcmp(orgnum, "NaN") == 0 ||
+			strcmp(orgnum, "Infinity") == 0 ||
+			strcmp(orgnum, "-Infinity") == 0)
 		{
 			/*
 			 * Allow 6 characters for the leading sign, the decimal point,
@@ -6346,7 +6349,7 @@ int8_to_char(PG_FUNCTION_ARGS)
 		/*
 		 * numeric_out_sci() does not emit a sign for positive numbers.  We
 		 * need to add a space in this case so that positive and negative
-		 * numbers are aligned.  We don't have to worry about NaN here.
+		 * numbers are aligned.  We don't have to worry about NaN/inf here.
 		 */
 		if (*orgnum != '-')
 		{
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 1773fa292e49e..ed825a1fddf97 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -109,14 +109,13 @@ typedef int16 NumericDigit;
  * If the high bits of the first word of a NumericChoice (n_header, or
  * n_short.n_header, or n_long.n_sign_dscale) are NUMERIC_SHORT, then the
  * numeric follows the NumericShort format; if they are NUMERIC_POS or
- * NUMERIC_NEG, it follows the NumericLong format.  If they are NUMERIC_NAN,
- * it is a NaN.  We currently always store a NaN using just two bytes (i.e.
- * only n_header), but previous releases used only the NumericLong format,
- * so we might find 4-byte NaNs on disk if a database has been migrated using
- * pg_upgrade.  In either case, when the high bits indicate a NaN, the
- * remaining bits are never examined.  Currently, we always initialize these
- * to zero, but it might be possible to use them for some other purpose in
- * the future.
+ * NUMERIC_NEG, it follows the NumericLong format. If they are NUMERIC_SPECIAL,
+ * the value is a NaN or Infinity.  We currently always store SPECIAL values
+ * using just two bytes (i.e. only n_header), but previous releases used only
+ * the NumericLong format, so we might find 4-byte NaNs (though not infinities)
+ * on disk if a database has been migrated using pg_upgrade.  In either case,
+ * the low-order bits of a special value's header are reserved and currently
+ * should always be set to zero.
  *
  * In the NumericShort format, the remaining 14 bits of the header word
  * (n_short.n_header) are allocated as follows: 1 for sign (positive or
@@ -168,25 +167,47 @@ struct NumericData
 #define NUMERIC_POS			0x0000
 #define NUMERIC_NEG			0x4000
 #define NUMERIC_SHORT		0x8000
-#define NUMERIC_NAN			0xC000
+#define NUMERIC_SPECIAL		0xC000
 
 #define NUMERIC_FLAGBITS(n) ((n)->choice.n_header & NUMERIC_SIGN_MASK)
-#define NUMERIC_IS_NAN(n)		(NUMERIC_FLAGBITS(n) == NUMERIC_NAN)
 #define NUMERIC_IS_SHORT(n)		(NUMERIC_FLAGBITS(n) == NUMERIC_SHORT)
+#define NUMERIC_IS_SPECIAL(n)	(NUMERIC_FLAGBITS(n) == NUMERIC_SPECIAL)
 
 #define NUMERIC_HDRSZ	(VARHDRSZ + sizeof(uint16) + sizeof(int16))
 #define NUMERIC_HDRSZ_SHORT (VARHDRSZ + sizeof(uint16))
 
 /*
- * If the flag bits are NUMERIC_SHORT or NUMERIC_NAN, we want the short header;
- * otherwise, we want the long one.  Instead of testing against each value, we
- * can just look at the high bit, for a slight efficiency gain.
+ * If the flag bits are NUMERIC_SHORT or NUMERIC_SPECIAL, we want the short
+ * header; otherwise, we want the long one.  Instead of testing against each
+ * value, we can just look at the high bit, for a slight efficiency gain.
  */
 #define NUMERIC_HEADER_IS_SHORT(n)	(((n)->choice.n_header & 0x8000) != 0)
 #define NUMERIC_HEADER_SIZE(n) \
 	(VARHDRSZ + sizeof(uint16) + \
 	 (NUMERIC_HEADER_IS_SHORT(n) ? 0 : sizeof(int16)))
 
+/*
+ * Definitions for special values (NaN, positive infinity, negative infinity).
+ *
+ * The two bits after the NUMERIC_SPECIAL bits are 00 for NaN, 01 for positive
+ * infinity, 11 for negative infinity.  (This makes the sign bit match where
+ * it is in a short-format value, though we make no use of that at present.)
+ * We could mask off the remaining bits before testing the active bits, but
+ * currently those bits must be zeroes, so masking would just add cycles.
+ */
+#define NUMERIC_EXT_SIGN_MASK	0xF000	/* high bits plus NaN/Inf flag bits */
+#define NUMERIC_NAN				0xC000
+#define NUMERIC_PINF			0xD000
+#define NUMERIC_NINF			0xF000
+#define NUMERIC_INF_SIGN_MASK	0x2000
+
+#define NUMERIC_EXT_FLAGBITS(n)	((n)->choice.n_header & NUMERIC_EXT_SIGN_MASK)
+#define NUMERIC_IS_NAN(n)		((n)->choice.n_header == NUMERIC_NAN)
+#define NUMERIC_IS_PINF(n)		((n)->choice.n_header == NUMERIC_PINF)
+#define NUMERIC_IS_NINF(n)		((n)->choice.n_header == NUMERIC_NINF)
+#define NUMERIC_IS_INF(n) \
+	(((n)->choice.n_header & ~NUMERIC_INF_SIGN_MASK) == NUMERIC_PINF)
+
 /*
  * Short format definitions.
  */
@@ -202,7 +223,13 @@ struct NumericData
 #define NUMERIC_SHORT_WEIGHT_MIN		(-(NUMERIC_SHORT_WEIGHT_MASK+1))
 
 /*
- * Extract sign, display scale, weight.
+ * Extract sign, display scale, weight.  These macros extract field values
+ * suitable for the NumericVar format from the Numeric (on-disk) format.
+ *
+ * Note that we don't trouble to ensure that dscale and weight read as zero
+ * for an infinity; however, that doesn't matter since we never convert
+ * "special" numerics to NumericVar form.  Only the constants defined below
+ * (const_nan, etc) ever represent a non-finite value as a NumericVar.
  */
 
 #define NUMERIC_DSCALE_MASK			0x3FFF
@@ -210,7 +237,9 @@ struct NumericData
 #define NUMERIC_SIGN(n) \
 	(NUMERIC_IS_SHORT(n) ? \
 		(((n)->choice.n_short.n_header & NUMERIC_SHORT_SIGN_MASK) ? \
-		NUMERIC_NEG : NUMERIC_POS) : NUMERIC_FLAGBITS(n))
+		 NUMERIC_NEG : NUMERIC_POS) : \
+		(NUMERIC_IS_SPECIAL(n) ? \
+		 NUMERIC_EXT_FLAGBITS(n) : NUMERIC_FLAGBITS(n)))
 #define NUMERIC_DSCALE(n)	(NUMERIC_HEADER_IS_SHORT((n)) ? \
 	((n)->choice.n_short.n_header & NUMERIC_SHORT_DSCALE_MASK) \
 		>> NUMERIC_SHORT_DSCALE_SHIFT \
@@ -227,7 +256,9 @@ struct NumericData
  * complex.
  *
  * The value represented by a NumericVar is determined by the sign, weight,
- * ndigits, and digits[] array.
+ * ndigits, and digits[] array.  If it is a "special" value (NaN or Inf)
+ * then only the sign field matters; ndigits should be zero, and the weight
+ * and dscale fields are ignored.
  *
  * Note: the first digit of a NumericVar's value is assumed to be multiplied
  * by NBASE ** weight.  Another way to say it is that there are weight+1
@@ -274,7 +305,7 @@ typedef struct NumericVar
 {
 	int			ndigits;		/* # of digits in digits[] - can be 0! */
 	int			weight;			/* weight of first digit */
-	int			sign;			/* NUMERIC_POS, NUMERIC_NEG, or NUMERIC_NAN */
+	int			sign;			/* NUMERIC_POS, _NEG, _NAN, _PINF, or _NINF */
 	int			dscale;			/* display scale */
 	NumericDigit *buf;			/* start of palloc'd space for digits[] */
 	NumericDigit *digits;		/* base-NBASE digits */
@@ -354,16 +385,26 @@ typedef struct NumericSumAccum
  * representations for numeric values in order to avoid depending on
  * USE_FLOAT8_BYVAL.  The type of abbreviation we use is based only on
  * the size of a datum, not the argument-passing convention for float8.
+ *
+ * The range of abbreviations for finite values is from +PG_INT64/32_MAX
+ * to -PG_INT64/32_MAX.  NaN has the abbreviation PG_INT64/32_MIN, and we
+ * define the sort ordering to make that work out properly (see further
+ * comments below).  PINF and NINF share the abbreviations of the largest
+ * and smallest finite abbreviation classes.
  */
 #define NUMERIC_ABBREV_BITS (SIZEOF_DATUM * BITS_PER_BYTE)
 #if SIZEOF_DATUM == 8
 #define NumericAbbrevGetDatum(X) ((Datum) (X))
 #define DatumGetNumericAbbrev(X) ((int64) (X))
 #define NUMERIC_ABBREV_NAN		 NumericAbbrevGetDatum(PG_INT64_MIN)
+#define NUMERIC_ABBREV_PINF		 NumericAbbrevGetDatum(-PG_INT64_MAX)
+#define NUMERIC_ABBREV_NINF		 NumericAbbrevGetDatum(PG_INT64_MAX)
 #else
 #define NumericAbbrevGetDatum(X) ((Datum) (X))
 #define DatumGetNumericAbbrev(X) ((int32) (X))
 #define NUMERIC_ABBREV_NAN		 NumericAbbrevGetDatum(PG_INT32_MIN)
+#define NUMERIC_ABBREV_PINF		 NumericAbbrevGetDatum(-PG_INT32_MAX)
+#define NUMERIC_ABBREV_NINF		 NumericAbbrevGetDatum(PG_INT32_MAX)
 #endif
 
 
@@ -379,6 +420,9 @@ static const NumericDigit const_one_data[1] = {1};
 static const NumericVar const_one =
 {1, 0, NUMERIC_POS, 0, NULL, (NumericDigit *) const_one_data};
 
+static const NumericVar const_minus_one =
+{1, 0, NUMERIC_NEG, 0, NULL, (NumericDigit *) const_one_data};
+
 static const NumericDigit const_two_data[1] = {2};
 static const NumericVar const_two =
 {1, 0, NUMERIC_POS, 0, NULL, (NumericDigit *) const_two_data};
@@ -416,6 +460,12 @@ static const NumericVar const_one_point_one =
 static const NumericVar const_nan =
 {0, 0, NUMERIC_NAN, 0, NULL, NULL};
 
+static const NumericVar const_pinf =
+{0, 0, NUMERIC_PINF, 0, NULL, NULL};
+
+static const NumericVar const_ninf =
+{0, 0, NUMERIC_NINF, 0, NULL, NULL};
+
 #if DEC_DIGITS == 4
 static const int round_powers[4] = {0, 1000, 100, 10};
 #endif
@@ -465,10 +515,12 @@ static void set_var_from_var(const NumericVar *value, NumericVar *dest);
 static char *get_str_from_var(const NumericVar *var);
 static char *get_str_from_var_sci(const NumericVar *var, int rscale);
 
+static Numeric duplicate_numeric(Numeric num);
 static Numeric make_result(const NumericVar *var);
 static Numeric make_result_opt_error(const NumericVar *var, bool *error);
 
 static void apply_typmod(NumericVar *var, int32 typmod);
+static void apply_typmod_special(Numeric num, int32 typmod);
 
 static bool numericvar_to_int32(const NumericVar *var, int32 *result);
 static bool numericvar_to_int64(const NumericVar *var, int64 *result);
@@ -478,7 +530,6 @@ static bool numericvar_to_uint64(const NumericVar *var, uint64 *result);
 static bool numericvar_to_int128(const NumericVar *var, int128 *result);
 static void int128_to_numericvar(int128 val, NumericVar *var);
 #endif
-static double numeric_to_double_no_overflow(Numeric num);
 static double numericvar_to_double_no_overflow(const NumericVar *var);
 
 static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup);
@@ -587,23 +638,43 @@ numeric_in(PG_FUNCTION_ARGS)
 	}
 
 	/*
-	 * Check for NaN
+	 * Check for NaN and infinities.  We recognize the same strings allowed by
+	 * float8in().
 	 */
 	if (pg_strncasecmp(cp, "NaN", 3) == 0)
 	{
 		res = make_result(&const_nan);
-
-		/* Should be nothing left but spaces */
 		cp += 3;
-		while (*cp)
-		{
-			if (!isspace((unsigned char) *cp))
-				ereport(ERROR,
-						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-						 errmsg("invalid input syntax for type %s: \"%s\"",
-								"numeric", str)));
-			cp++;
-		}
+	}
+	else if (pg_strncasecmp(cp, "Infinity", 8) == 0)
+	{
+		res = make_result(&const_pinf);
+		cp += 8;
+	}
+	else if (pg_strncasecmp(cp, "+Infinity", 9) == 0)
+	{
+		res = make_result(&const_pinf);
+		cp += 9;
+	}
+	else if (pg_strncasecmp(cp, "-Infinity", 9) == 0)
+	{
+		res = make_result(&const_ninf);
+		cp += 9;
+	}
+	else if (pg_strncasecmp(cp, "inf", 3) == 0)
+	{
+		res = make_result(&const_pinf);
+		cp += 3;
+	}
+	else if (pg_strncasecmp(cp, "+inf", 4) == 0)
+	{
+		res = make_result(&const_pinf);
+		cp += 4;
+	}
+	else if (pg_strncasecmp(cp, "-inf", 4) == 0)
+	{
+		res = make_result(&const_ninf);
+		cp += 4;
 	}
 	else
 	{
@@ -620,7 +691,7 @@ numeric_in(PG_FUNCTION_ARGS)
 		 * We duplicate a few lines of code here because we would like to
 		 * throw any trailing-junk syntax error before any semantic error
 		 * resulting from apply_typmod.  We can't easily fold the two cases
-		 * together because we mustn't apply apply_typmod to a NaN.
+		 * together because we mustn't apply apply_typmod to a NaN/Inf.
 		 */
 		while (*cp)
 		{
@@ -636,8 +707,24 @@ numeric_in(PG_FUNCTION_ARGS)
 
 		res = make_result(&value);
 		free_var(&value);
+
+		PG_RETURN_NUMERIC(res);
 	}
 
+	/* Should be nothing left but spaces */
+	while (*cp)
+	{
+		if (!isspace((unsigned char) *cp))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s: \"%s\"",
+							"numeric", str)));
+		cp++;
+	}
+
+	/* As above, throw any typmod error after finishing syntax check */
+	apply_typmod_special(res, typmod);
+
 	PG_RETURN_NUMERIC(res);
 }
 
@@ -655,10 +742,17 @@ numeric_out(PG_FUNCTION_ARGS)
 	char	   *str;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_CSTRING(pstrdup("NaN"));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			PG_RETURN_CSTRING(pstrdup("Infinity"));
+		else if (NUMERIC_IS_NINF(num))
+			PG_RETURN_CSTRING(pstrdup("-Infinity"));
+		else
+			PG_RETURN_CSTRING(pstrdup("NaN"));
+	}
 
 	/*
 	 * Get the number in the variable format.
@@ -681,6 +775,41 @@ numeric_is_nan(Numeric num)
 	return NUMERIC_IS_NAN(num);
 }
 
+/*
+ * numeric_is_inf() -
+ *
+ *	Is Numeric value an infinity?
+ */
+bool
+numeric_is_inf(Numeric num)
+{
+	return NUMERIC_IS_INF(num);
+}
+
+/*
+ * numeric_is_integral() -
+ *
+ *	Is Numeric value integral?
+ */
+static bool
+numeric_is_integral(Numeric num)
+{
+	NumericVar	arg;
+
+	/* Reject NaN, but infinities are considered integral */
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_NAN(num))
+			return false;
+		return true;
+	}
+
+	/* Integral if there are no digits to the right of the decimal point */
+	init_var_from_num(num, &arg);
+
+	return (arg.ndigits == 0 || arg.ndigits <= arg.weight + 1);
+}
+
 /*
  * numeric_maximum_size() -
  *
@@ -732,10 +861,17 @@ numeric_out_sci(Numeric num, int scale)
 	char	   *str;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		return pstrdup("NaN");
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			return pstrdup("Infinity");
+		else if (NUMERIC_IS_NINF(num))
+			return pstrdup("-Infinity");
+		else
+			return pstrdup("NaN");
+	}
 
 	init_var_from_num(num, &x);
 
@@ -760,10 +896,17 @@ numeric_normalize(Numeric num)
 	int			last;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		return pstrdup("NaN");
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			return pstrdup("Infinity");
+		else if (NUMERIC_IS_NINF(num))
+			return pstrdup("-Infinity");
+		else
+			return pstrdup("NaN");
+	}
 
 	init_var_from_num(num, &x);
 
@@ -823,7 +966,9 @@ numeric_recv(PG_FUNCTION_ARGS)
 	value.sign = (uint16) pq_getmsgint(buf, sizeof(uint16));
 	if (!(value.sign == NUMERIC_POS ||
 		  value.sign == NUMERIC_NEG ||
-		  value.sign == NUMERIC_NAN))
+		  value.sign == NUMERIC_NAN ||
+		  value.sign == NUMERIC_PINF ||
+		  value.sign == NUMERIC_NINF))
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
 				 errmsg("invalid sign in external \"numeric\" value")));
@@ -849,13 +994,29 @@ numeric_recv(PG_FUNCTION_ARGS)
 	 * If the given dscale would hide any digits, truncate those digits away.
 	 * We could alternatively throw an error, but that would take a bunch of
 	 * extra code (about as much as trunc_var involves), and it might cause
-	 * client compatibility issues.
+	 * client compatibility issues.  Be careful not to apply trunc_var to
+	 * special values, as it could do the wrong thing; we don't need it
+	 * anyway, since make_result will ignore all but the sign field.
+	 *
+	 * After doing that, be sure to check the typmod restriction.
 	 */
-	trunc_var(&value, value.dscale);
+	if (value.sign == NUMERIC_POS ||
+		value.sign == NUMERIC_NEG)
+	{
+		trunc_var(&value, value.dscale);
 
-	apply_typmod(&value, typmod);
+		apply_typmod(&value, typmod);
+
+		res = make_result(&value);
+	}
+	else
+	{
+		/* apply_typmod_special wants us to make the Numeric first */
+		res = make_result(&value);
+
+		apply_typmod_special(res, typmod);
+	}
 
-	res = make_result(&value);
 	free_var(&value);
 
 	PG_RETURN_NUMERIC(res);
@@ -961,21 +1122,21 @@ numeric		(PG_FUNCTION_ARGS)
 	NumericVar	var;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities: if apply_typmod_special doesn't complain,
+	 * just return a copy of the input.
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		apply_typmod_special(num, typmod);
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
+	}
 
 	/*
 	 * If the value isn't a valid type modifier, simply return a copy of the
 	 * input value
 	 */
 	if (typmod < (int32) (VARHDRSZ))
-	{
-		new = (Numeric) palloc(VARSIZE(num));
-		memcpy(new, num, VARSIZE(num));
-		PG_RETURN_NUMERIC(new);
-	}
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	/*
 	 * Get the precision and scale out of the typmod value
@@ -997,8 +1158,7 @@ numeric		(PG_FUNCTION_ARGS)
 		&& (NUMERIC_CAN_BE_SHORT(scale, NUMERIC_WEIGHT(num))
 			|| !NUMERIC_IS_SHORT(num)))
 	{
-		new = (Numeric) palloc(VARSIZE(num));
-		memcpy(new, num, VARSIZE(num));
+		new = duplicate_numeric(num);
 		if (NUMERIC_IS_SHORT(num))
 			new->choice.n_short.n_header =
 				(num->choice.n_short.n_header & ~NUMERIC_SHORT_DSCALE_MASK)
@@ -1099,21 +1259,20 @@ numeric_abs(PG_FUNCTION_ARGS)
 	Numeric		num = PG_GETARG_NUMERIC(0);
 	Numeric		res;
 
-	/*
-	 * Handle NaN
-	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
-
 	/*
 	 * Do it the easy way directly on the packed format
 	 */
-	res = (Numeric) palloc(VARSIZE(num));
-	memcpy(res, num, VARSIZE(num));
+	res = duplicate_numeric(num);
 
 	if (NUMERIC_IS_SHORT(num))
 		res->choice.n_short.n_header =
 			num->choice.n_short.n_header & ~NUMERIC_SHORT_SIGN_MASK;
+	else if (NUMERIC_IS_SPECIAL(num))
+	{
+		/* This changes -Inf to Inf, and doesn't affect NaN */
+		res->choice.n_short.n_header =
+			num->choice.n_short.n_header & ~NUMERIC_INF_SIGN_MASK;
+	}
 	else
 		res->choice.n_long.n_sign_dscale = NUMERIC_POS | NUMERIC_DSCALE(num);
 
@@ -1127,24 +1286,25 @@ numeric_uminus(PG_FUNCTION_ARGS)
 	Numeric		num = PG_GETARG_NUMERIC(0);
 	Numeric		res;
 
-	/*
-	 * Handle NaN
-	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
-
 	/*
 	 * Do it the easy way directly on the packed format
 	 */
-	res = (Numeric) palloc(VARSIZE(num));
-	memcpy(res, num, VARSIZE(num));
+	res = duplicate_numeric(num);
+
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		/* Flip the sign, if it's Inf or -Inf */
+		if (!NUMERIC_IS_NAN(num))
+			res->choice.n_short.n_header =
+				num->choice.n_short.n_header ^ NUMERIC_INF_SIGN_MASK;
+	}
 
 	/*
 	 * The packed format is known to be totally zero digit trimmed always. So
-	 * we can identify a ZERO by the fact that there are no digits at all.  Do
-	 * nothing to a zero.
+	 * once we've eliminated specials, we can identify a zero by the fact that
+	 * there are no digits at all. Do nothing to a zero.
 	 */
-	if (NUMERIC_NDIGITS(num) != 0)
+	else if (NUMERIC_NDIGITS(num) != 0)
 	{
 		/* Else, flip the sign */
 		if (NUMERIC_IS_SHORT(num))
@@ -1166,12 +1326,42 @@ Datum
 numeric_uplus(PG_FUNCTION_ARGS)
 {
 	Numeric		num = PG_GETARG_NUMERIC(0);
-	Numeric		res;
 
-	res = (Numeric) palloc(VARSIZE(num));
-	memcpy(res, num, VARSIZE(num));
+	PG_RETURN_NUMERIC(duplicate_numeric(num));
+}
 
-	PG_RETURN_NUMERIC(res);
+
+/*
+ * numeric_sign_internal() -
+ *
+ * Returns -1 if the argument is less than 0, 0 if the argument is equal
+ * to 0, and 1 if the argument is greater than zero.  Caller must have
+ * taken care of the NaN case, but we can handle infinities here.
+ */
+static int
+numeric_sign_internal(Numeric num)
+{
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		Assert(!NUMERIC_IS_NAN(num));
+		/* Must be Inf or -Inf */
+		if (NUMERIC_IS_PINF(num))
+			return 1;
+		else
+			return -1;
+	}
+
+	/*
+	 * The packed format is known to be totally zero digit trimmed always. So
+	 * once we've eliminated specials, we can identify a zero by the fact that
+	 * there are no digits at all.
+	 */
+	else if (NUMERIC_NDIGITS(num) == 0)
+		return 0;
+	else if (NUMERIC_SIGN(num) == NUMERIC_NEG)
+		return -1;
+	else
+		return 1;
 }
 
 /*
@@ -1184,37 +1374,25 @@ Datum
 numeric_sign(PG_FUNCTION_ARGS)
 {
 	Numeric		num = PG_GETARG_NUMERIC(0);
-	Numeric		res;
-	NumericVar	result;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN (infinities can be handled normally)
 	 */
 	if (NUMERIC_IS_NAN(num))
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
-	init_var(&result);
-
-	/*
-	 * The packed format is known to be totally zero digit trimmed always. So
-	 * we can identify a ZERO by the fact that there are no digits at all.
-	 */
-	if (NUMERIC_NDIGITS(num) == 0)
-		set_var_from_var(&const_zero, &result);
-	else
+	switch (numeric_sign_internal(num))
 	{
-		/*
-		 * And if there are some, we return a copy of ONE with the sign of our
-		 * argument
-		 */
-		set_var_from_var(&const_one, &result);
-		result.sign = NUMERIC_SIGN(num);
+		case 0:
+			PG_RETURN_NUMERIC(make_result(&const_zero));
+		case 1:
+			PG_RETURN_NUMERIC(make_result(&const_one));
+		case -1:
+			PG_RETURN_NUMERIC(make_result(&const_minus_one));
 	}
 
-	res = make_result(&result);
-	free_var(&result);
-
-	PG_RETURN_NUMERIC(res);
+	Assert(false);
+	return (Datum) 0;
 }
 
 
@@ -1234,10 +1412,10 @@ numeric_round(PG_FUNCTION_ARGS)
 	NumericVar	arg;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	/*
 	 * Limit the scale value to avoid possible overflow in calculations
@@ -1283,10 +1461,10 @@ numeric_trunc(PG_FUNCTION_ARGS)
 	NumericVar	arg;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	/*
 	 * Limit the scale value to avoid possible overflow in calculations
@@ -1328,8 +1506,11 @@ numeric_ceil(PG_FUNCTION_ARGS)
 	Numeric		res;
 	NumericVar	result;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	/*
+	 * Handle NaN and infinities
+	 */
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	init_var_from_num(num, &result);
 	ceil_var(&result, &result);
@@ -1353,8 +1534,11 @@ numeric_floor(PG_FUNCTION_ARGS)
 	Numeric		res;
 	NumericVar	result;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	/*
+	 * Handle NaN and infinities
+	 */
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	init_var_from_num(num, &result);
 	floor_var(&result, &result);
@@ -1390,26 +1574,46 @@ generate_series_step_numeric(PG_FUNCTION_ARGS)
 		Numeric		stop_num = PG_GETARG_NUMERIC(1);
 		NumericVar	steploc = const_one;
 
-		/* handle NaN in start and stop values */
-		if (NUMERIC_IS_NAN(start_num))
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("start value cannot be NaN")));
-
-		if (NUMERIC_IS_NAN(stop_num))
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("stop value cannot be NaN")));
+		/* Reject NaN and infinities in start and stop values */
+		if (NUMERIC_IS_SPECIAL(start_num))
+		{
+			if (NUMERIC_IS_NAN(start_num))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("start value cannot be NaN")));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("start value cannot be infinity")));
+		}
+		if (NUMERIC_IS_SPECIAL(stop_num))
+		{
+			if (NUMERIC_IS_NAN(stop_num))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("stop value cannot be NaN")));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("stop value cannot be infinity")));
+		}
 
 		/* see if we were given an explicit step size */
 		if (PG_NARGS() == 3)
 		{
 			Numeric		step_num = PG_GETARG_NUMERIC(2);
 
-			if (NUMERIC_IS_NAN(step_num))
-				ereport(ERROR,
-						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-						 errmsg("step size cannot be NaN")));
+			if (NUMERIC_IS_SPECIAL(step_num))
+			{
+				if (NUMERIC_IS_NAN(step_num))
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							 errmsg("step size cannot be NaN")));
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							 errmsg("step size cannot be infinity")));
+			}
 
 			init_var_from_num(step_num, &steploc);
 
@@ -1510,12 +1714,21 @@ width_bucket_numeric(PG_FUNCTION_ARGS)
 				(errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION),
 				 errmsg("count must be greater than zero")));
 
-	if (NUMERIC_IS_NAN(operand) ||
-		NUMERIC_IS_NAN(bound1) ||
-		NUMERIC_IS_NAN(bound2))
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION),
-				 errmsg("operand, lower bound, and upper bound cannot be NaN")));
+	if (NUMERIC_IS_SPECIAL(operand) ||
+		NUMERIC_IS_SPECIAL(bound1) ||
+		NUMERIC_IS_SPECIAL(bound2))
+	{
+		if (NUMERIC_IS_NAN(operand) ||
+			NUMERIC_IS_NAN(bound1) ||
+			NUMERIC_IS_NAN(bound2))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION),
+					 errmsg("operand, lower bound, and upper bound cannot be NaN")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION),
+					 errmsg("operand, lower bound, and upper bound cannot be infinity")));
+	}
 
 	init_var(&result_var);
 	init_var(&count_var);
@@ -1719,9 +1932,14 @@ numeric_abbrev_convert(Datum original_datum, SortSupport ssup)
 	else
 		value = (Numeric) original_varatt;
 
-	if (NUMERIC_IS_NAN(value))
+	if (NUMERIC_IS_SPECIAL(value))
 	{
-		result = NUMERIC_ABBREV_NAN;
+		if (NUMERIC_IS_PINF(value))
+			result = NUMERIC_ABBREV_PINF;
+		else if (NUMERIC_IS_NINF(value))
+			result = NUMERIC_ABBREV_NINF;
+		else
+			result = NUMERIC_ABBREV_NAN;
 	}
 	else
 	{
@@ -1847,7 +2065,7 @@ numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
 {
 	/*
 	 * NOTE WELL: this is intentionally backwards, because the abbreviation is
-	 * negated relative to the original value, to handle NaN.
+	 * negated relative to the original value, to handle NaN/infinity cases.
 	 */
 	if (DatumGetNumericAbbrev(x) < DatumGetNumericAbbrev(y))
 		return 1;
@@ -2150,20 +2368,42 @@ cmp_numerics(Numeric num1, Numeric num2)
 	int			result;
 
 	/*
-	 * We consider all NANs to be equal and larger than any non-NAN. This is
-	 * somewhat arbitrary; the important thing is to have a consistent sort
-	 * order.
+	 * We consider all NANs to be equal and larger than any non-NAN (including
+	 * Infinity).  This is somewhat arbitrary; the important thing is to have
+	 * a consistent sort order.
 	 */
-	if (NUMERIC_IS_NAN(num1))
+	if (NUMERIC_IS_SPECIAL(num1))
 	{
-		if (NUMERIC_IS_NAN(num2))
-			result = 0;			/* NAN = NAN */
-		else
-			result = 1;			/* NAN > non-NAN */
+		if (NUMERIC_IS_NAN(num1))
+		{
+			if (NUMERIC_IS_NAN(num2))
+				result = 0;		/* NAN = NAN */
+			else
+				result = 1;		/* NAN > non-NAN */
+		}
+		else if (NUMERIC_IS_PINF(num1))
+		{
+			if (NUMERIC_IS_NAN(num2))
+				result = -1;	/* PINF < NAN */
+			else if (NUMERIC_IS_PINF(num2))
+				result = 0;		/* PINF = PINF */
+			else
+				result = 1;		/* PINF > anything else */
+		}
+		else					/* num1 must be NINF */
+		{
+			if (NUMERIC_IS_NINF(num2))
+				result = 0;		/* NINF = NINF */
+			else
+				result = -1;	/* NINF < anything else */
+		}
 	}
-	else if (NUMERIC_IS_NAN(num2))
+	else if (NUMERIC_IS_SPECIAL(num2))
 	{
-		result = -1;			/* non-NAN < NAN */
+		if (NUMERIC_IS_NINF(num2))
+			result = 1;			/* normal > NINF */
+		else
+			result = -1;		/* normal < NAN or PINF */
 	}
 	else
 	{
@@ -2190,10 +2430,12 @@ in_range_numeric_numeric(PG_FUNCTION_ARGS)
 	bool		result;
 
 	/*
-	 * Reject negative or NaN offset.  Negative is per spec, and NaN is
-	 * because appropriate semantics for that seem non-obvious.
+	 * Reject negative (including -Inf) or NaN offset.  Negative is per spec,
+	 * and NaN is because appropriate semantics for that seem non-obvious.
 	 */
-	if (NUMERIC_IS_NAN(offset) || NUMERIC_SIGN(offset) == NUMERIC_NEG)
+	if (NUMERIC_IS_NAN(offset) ||
+		NUMERIC_IS_NINF(offset) ||
+		NUMERIC_SIGN(offset) == NUMERIC_NEG)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE),
 				 errmsg("invalid preceding or following size in window function")));
@@ -2214,6 +2456,67 @@ in_range_numeric_numeric(PG_FUNCTION_ARGS)
 	{
 		result = less;			/* non-NAN < NAN */
 	}
+
+	/*
+	 * Deal with infinite offset (necessarily +Inf, at this point).
+	 */
+	else if (NUMERIC_IS_SPECIAL(offset))
+	{
+		Assert(NUMERIC_IS_PINF(offset));
+		if (sub ? NUMERIC_IS_PINF(base) : NUMERIC_IS_NINF(base))
+		{
+			/*
+			 * base +/- offset would produce NaN, so return true for any val
+			 * (see in_range_float8_float8() for reasoning).
+			 */
+			result = true;
+		}
+		else if (sub)
+		{
+			/* base - offset must be -inf */
+			if (less)
+				result = NUMERIC_IS_NINF(val);	/* only -inf is <= sum */
+			else
+				result = true;	/* any val is >= sum */
+		}
+		else
+		{
+			/* base + offset must be +inf */
+			if (less)
+				result = true;	/* any val is <= sum */
+			else
+				result = NUMERIC_IS_PINF(val);	/* only +inf is >= sum */
+		}
+	}
+
+	/*
+	 * Deal with cases where val and/or base is infinite.  The offset, being
+	 * now known finite, cannot affect the conclusion.
+	 */
+	else if (NUMERIC_IS_SPECIAL(val))
+	{
+		if (NUMERIC_IS_PINF(val))
+		{
+			if (NUMERIC_IS_PINF(base))
+				result = true;	/* PINF = PINF */
+			else
+				result = !less; /* PINF > any other non-NAN */
+		}
+		else					/* val must be NINF */
+		{
+			if (NUMERIC_IS_NINF(base))
+				result = true;	/* NINF = NINF */
+			else
+				result = less;	/* NINF < anything else */
+		}
+	}
+	else if (NUMERIC_IS_SPECIAL(base))
+	{
+		if (NUMERIC_IS_NINF(base))
+			result = !less;		/* normal > NINF */
+		else
+			result = less;		/* normal < PINF */
+	}
 	else
 	{
 		/*
@@ -2264,8 +2567,8 @@ hash_numeric(PG_FUNCTION_ARGS)
 	int			hash_len;
 	NumericDigit *digits;
 
-	/* If it's NaN, don't try to hash the rest of the fields */
-	if (NUMERIC_IS_NAN(key))
+	/* If it's NaN or infinity, don't try to hash the rest of the fields */
+	if (NUMERIC_IS_SPECIAL(key))
 		PG_RETURN_UINT32(0);
 
 	weight = NUMERIC_WEIGHT(key);
@@ -2345,7 +2648,8 @@ hash_numeric_extended(PG_FUNCTION_ARGS)
 	int			hash_len;
 	NumericDigit *digits;
 
-	if (NUMERIC_IS_NAN(key))
+	/* If it's NaN or infinity, don't try to hash the rest of the fields */
+	if (NUMERIC_IS_SPECIAL(key))
 		PG_RETURN_UINT64(seed);
 
 	weight = NUMERIC_WEIGHT(key);
@@ -2429,10 +2733,32 @@ numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		return make_result(&const_nan);
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			return make_result(&const_nan);
+		if (NUMERIC_IS_PINF(num1))
+		{
+			if (NUMERIC_IS_NINF(num2))
+				return make_result(&const_nan); /* Inf + -Inf */
+			else
+				return make_result(&const_pinf);
+		}
+		if (NUMERIC_IS_NINF(num1))
+		{
+			if (NUMERIC_IS_PINF(num2))
+				return make_result(&const_nan); /* -Inf + Inf */
+			else
+				return make_result(&const_ninf);
+		}
+		/* by here, num1 must be finite, so num2 is not */
+		if (NUMERIC_IS_PINF(num2))
+			return make_result(&const_pinf);
+		Assert(NUMERIC_IS_NINF(num2));
+		return make_result(&const_ninf);
+	}
 
 	/*
 	 * Unpack the values, let add_var() compute the result and return it.
@@ -2485,10 +2811,32 @@ numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		return make_result(&const_nan);
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			return make_result(&const_nan);
+		if (NUMERIC_IS_PINF(num1))
+		{
+			if (NUMERIC_IS_PINF(num2))
+				return make_result(&const_nan); /* Inf - Inf */
+			else
+				return make_result(&const_pinf);
+		}
+		if (NUMERIC_IS_NINF(num1))
+		{
+			if (NUMERIC_IS_NINF(num2))
+				return make_result(&const_nan); /* -Inf - -Inf */
+			else
+				return make_result(&const_ninf);
+		}
+		/* by here, num1 must be finite, so num2 is not */
+		if (NUMERIC_IS_PINF(num2))
+			return make_result(&const_ninf);
+		Assert(NUMERIC_IS_NINF(num2));
+		return make_result(&const_pinf);
+	}
 
 	/*
 	 * Unpack the values, let sub_var() compute the result and return it.
@@ -2541,10 +2889,64 @@ numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		return make_result(&const_nan);
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			return make_result(&const_nan);
+		if (NUMERIC_IS_PINF(num1))
+		{
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					return make_result(&const_nan); /* Inf * 0 */
+				case 1:
+					return make_result(&const_pinf);
+				case -1:
+					return make_result(&const_ninf);
+			}
+			Assert(false);
+		}
+		if (NUMERIC_IS_NINF(num1))
+		{
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					return make_result(&const_nan); /* -Inf * 0 */
+				case 1:
+					return make_result(&const_ninf);
+				case -1:
+					return make_result(&const_pinf);
+			}
+			Assert(false);
+		}
+		/* by here, num1 must be finite, so num2 is not */
+		if (NUMERIC_IS_PINF(num2))
+		{
+			switch (numeric_sign_internal(num1))
+			{
+				case 0:
+					return make_result(&const_nan); /* 0 * Inf */
+				case 1:
+					return make_result(&const_pinf);
+				case -1:
+					return make_result(&const_ninf);
+			}
+			Assert(false);
+		}
+		Assert(NUMERIC_IS_NINF(num2));
+		switch (numeric_sign_internal(num1))
+		{
+			case 0:
+				return make_result(&const_nan); /* 0 * -Inf */
+			case 1:
+				return make_result(&const_ninf);
+			case -1:
+				return make_result(&const_pinf);
+		}
+		Assert(false);
+	}
 
 	/*
 	 * Unpack the values, let mul_var() compute the result and return it.
@@ -2605,10 +3007,67 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error)
 		*have_error = false;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		return make_result(&const_nan);
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			return make_result(&const_nan);
+		if (NUMERIC_IS_PINF(num1))
+		{
+			if (NUMERIC_IS_SPECIAL(num2))
+				return make_result(&const_nan); /* Inf / [-]Inf */
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					if (have_error)
+					{
+						*have_error = true;
+						return NULL;
+					}
+					ereport(ERROR,
+							(errcode(ERRCODE_DIVISION_BY_ZERO),
+							 errmsg("division by zero")));
+					break;
+				case 1:
+					return make_result(&const_pinf);
+				case -1:
+					return make_result(&const_ninf);
+			}
+			Assert(false);
+		}
+		if (NUMERIC_IS_NINF(num1))
+		{
+			if (NUMERIC_IS_SPECIAL(num2))
+				return make_result(&const_nan); /* -Inf / [-]Inf */
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					if (have_error)
+					{
+						*have_error = true;
+						return NULL;
+					}
+					ereport(ERROR,
+							(errcode(ERRCODE_DIVISION_BY_ZERO),
+							 errmsg("division by zero")));
+					break;
+				case 1:
+					return make_result(&const_ninf);
+				case -1:
+					return make_result(&const_pinf);
+			}
+			Assert(false);
+		}
+		/* by here, num1 must be finite, so num2 is not */
+
+		/*
+		 * POSIX would have us return zero or minus zero if num1 is zero, and
+		 * otherwise throw an underflow error.  But the numeric type doesn't
+		 * really do underflow, so let's just return zero.
+		 */
+		return make_result(&const_zero);
+	}
 
 	/*
 	 * Unpack the arguments
@@ -2661,10 +3120,57 @@ numeric_div_trunc(PG_FUNCTION_ARGS)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			PG_RETURN_NUMERIC(make_result(&const_nan));
+		if (NUMERIC_IS_PINF(num1))
+		{
+			if (NUMERIC_IS_SPECIAL(num2))
+				PG_RETURN_NUMERIC(make_result(&const_nan)); /* Inf / [-]Inf */
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					ereport(ERROR,
+							(errcode(ERRCODE_DIVISION_BY_ZERO),
+							 errmsg("division by zero")));
+					break;
+				case 1:
+					PG_RETURN_NUMERIC(make_result(&const_pinf));
+				case -1:
+					PG_RETURN_NUMERIC(make_result(&const_ninf));
+			}
+			Assert(false);
+		}
+		if (NUMERIC_IS_NINF(num1))
+		{
+			if (NUMERIC_IS_SPECIAL(num2))
+				PG_RETURN_NUMERIC(make_result(&const_nan)); /* -Inf / [-]Inf */
+			switch (numeric_sign_internal(num2))
+			{
+				case 0:
+					ereport(ERROR,
+							(errcode(ERRCODE_DIVISION_BY_ZERO),
+							 errmsg("division by zero")));
+					break;
+				case 1:
+					PG_RETURN_NUMERIC(make_result(&const_ninf));
+				case -1:
+					PG_RETURN_NUMERIC(make_result(&const_pinf));
+			}
+			Assert(false);
+		}
+		/* by here, num1 must be finite, so num2 is not */
+
+		/*
+		 * POSIX would have us return zero or minus zero if num1 is zero, and
+		 * otherwise throw an underflow error.  But the numeric type doesn't
+		 * really do underflow, so let's just return zero.
+		 */
+		PG_RETURN_NUMERIC(make_result(&const_zero));
+	}
 
 	/*
 	 * Unpack the arguments
@@ -2723,8 +3229,34 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error)
 	if (have_error)
 		*have_error = false;
 
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		return make_result(&const_nan);
+	/*
+	 * Handle NaN and infinities.  We follow POSIX fmod() on this, except that
+	 * POSIX treats x-is-infinite and y-is-zero identically, raising EDOM and
+	 * returning NaN.  We choose to throw error only for y-is-zero.
+	 */
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			return make_result(&const_nan);
+		if (NUMERIC_IS_INF(num1))
+		{
+			if (numeric_sign_internal(num2) == 0)
+			{
+				if (have_error)
+				{
+					*have_error = true;
+					return NULL;
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_DIVISION_BY_ZERO),
+						 errmsg("division by zero")));
+			}
+			/* Inf % any nonzero = NaN */
+			return make_result(&const_nan);
+		}
+		/* num2 must be [-]Inf; result is num1 regardless of sign of num2 */
+		return duplicate_numeric(num1);
+	}
 
 	init_var_from_num(num1, &arg1);
 	init_var_from_num(num2, &arg2);
@@ -2763,10 +3295,10 @@ numeric_inc(PG_FUNCTION_ARGS)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	/*
 	 * Compute the result and return it
@@ -2850,9 +3382,10 @@ numeric_gcd(PG_FUNCTION_ARGS)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities: we consider the result to be NaN in all such
+	 * cases.
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
 	/*
@@ -2892,9 +3425,10 @@ numeric_lcm(PG_FUNCTION_ARGS)
 	Numeric		res;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities: we consider the result to be NaN in all such
+	 * cases.
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
 	/*
@@ -3003,10 +3537,18 @@ numeric_sqrt(PG_FUNCTION_ARGS)
 	int			rscale;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		/* error should match that in sqrt_var() */
+		if (NUMERIC_IS_NINF(num))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_POWER_FUNCTION),
+					 errmsg("cannot take square root of a negative number")));
+		/* For NAN or PINF, just duplicate the input */
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
+	}
 
 	/*
 	 * Unpack the argument and determine the result scale.  We choose a scale
@@ -3054,10 +3596,16 @@ numeric_exp(PG_FUNCTION_ARGS)
 	double		val;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		/* Per POSIX, exp(-Inf) is zero */
+		if (NUMERIC_IS_NINF(num))
+			PG_RETURN_NUMERIC(make_result(&const_zero));
+		/* For NAN or PINF, just duplicate the input */
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
+	}
 
 	/*
 	 * Unpack the argument and determine the result scale.  We choose a scale
@@ -3115,10 +3663,17 @@ numeric_ln(PG_FUNCTION_ARGS)
 	int			rscale;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_NINF(num))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_LOG),
+					 errmsg("cannot take logarithm of a negative number")));
+		/* For NAN or PINF, just duplicate the input */
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
+	}
 
 	init_var_from_num(num, &arg);
 	init_var(&result);
@@ -3157,10 +3712,39 @@ numeric_log(PG_FUNCTION_ARGS)
 	NumericVar	result;
 
 	/*
-	 * Handle NaN
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
+	{
+		int			sign1,
+					sign2;
+
+		if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2))
+			PG_RETURN_NUMERIC(make_result(&const_nan));
+		/* fail on negative inputs including -Inf, as log_var would */
+		sign1 = numeric_sign_internal(num1);
+		sign2 = numeric_sign_internal(num2);
+		if (sign1 < 0 || sign2 < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_LOG),
+					 errmsg("cannot take logarithm of a negative number")));
+		/* fail on zero inputs, as log_var would */
+		if (sign1 == 0 || sign2 == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_LOG),
+					 errmsg("cannot take logarithm of zero")));
+		if (NUMERIC_IS_PINF(num1))
+		{
+			/* log(Inf, Inf) reduces to Inf/Inf, so it's NaN */
+			if (NUMERIC_IS_PINF(num2))
+				PG_RETURN_NUMERIC(make_result(&const_nan));
+			/* log(Inf, finite-positive) is zero (we don't throw underflow) */
+			PG_RETURN_NUMERIC(make_result(&const_zero));
+		}
+		Assert(NUMERIC_IS_PINF(num2));
+		/* log(finite-positive, Inf) is Inf */
+		PG_RETURN_NUMERIC(make_result(&const_pinf));
+	}
 
 	/*
 	 * Initialize things
@@ -3186,7 +3770,7 @@ numeric_log(PG_FUNCTION_ARGS)
 /*
  * numeric_power() -
  *
- *	Raise b to the power of x
+ *	Raise x to the power of y
  */
 Datum
 numeric_power(PG_FUNCTION_ARGS)
@@ -3196,60 +3780,170 @@ numeric_power(PG_FUNCTION_ARGS)
 	Numeric		res;
 	NumericVar	arg1;
 	NumericVar	arg2;
-	NumericVar	arg2_trunc;
 	NumericVar	result;
+	int			sign1,
+				sign2;
 
 	/*
-	 * Handle NaN cases.  We follow the POSIX spec for pow(3), which says that
-	 * NaN ^ 0 = 1, and 1 ^ NaN = 1, while all other cases with NaN inputs
-	 * yield NaN (with no error).
+	 * Handle NaN and infinities
 	 */
-	if (NUMERIC_IS_NAN(num1))
+	if (NUMERIC_IS_SPECIAL(num1) || NUMERIC_IS_SPECIAL(num2))
 	{
-		if (!NUMERIC_IS_NAN(num2))
+		/*
+		 * We follow the POSIX spec for pow(3), which says that NaN ^ 0 = 1,
+		 * and 1 ^ NaN = 1, while all other cases with NaN inputs yield NaN
+		 * (with no error).
+		 */
+		if (NUMERIC_IS_NAN(num1))
+		{
+			if (!NUMERIC_IS_SPECIAL(num2))
+			{
+				init_var_from_num(num2, &arg2);
+				if (cmp_var(&arg2, &const_zero) == 0)
+					PG_RETURN_NUMERIC(make_result(&const_one));
+			}
+			PG_RETURN_NUMERIC(make_result(&const_nan));
+		}
+		if (NUMERIC_IS_NAN(num2))
+		{
+			if (!NUMERIC_IS_SPECIAL(num1))
+			{
+				init_var_from_num(num1, &arg1);
+				if (cmp_var(&arg1, &const_one) == 0)
+					PG_RETURN_NUMERIC(make_result(&const_one));
+			}
+			PG_RETURN_NUMERIC(make_result(&const_nan));
+		}
+		/* At least one input is infinite, but error rules still apply */
+		sign1 = numeric_sign_internal(num1);
+		sign2 = numeric_sign_internal(num2);
+		if (sign1 == 0 && sign2 < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_POWER_FUNCTION),
+					 errmsg("zero raised to a negative power is undefined")));
+		if (sign1 < 0 && !numeric_is_integral(num2))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ARGUMENT_FOR_POWER_FUNCTION),
+					 errmsg("a negative number raised to a non-integer power yields a complex result")));
+
+		/*
+		 * POSIX gives this series of rules for pow(3) with infinite inputs:
+		 *
+		 * For any value of y, if x is +1, 1.0 shall be returned.
+		 */
+		if (!NUMERIC_IS_SPECIAL(num1))
 		{
-			init_var_from_num(num2, &arg2);
-			if (cmp_var(&arg2, &const_zero) == 0)
+			init_var_from_num(num1, &arg1);
+			if (cmp_var(&arg1, &const_one) == 0)
 				PG_RETURN_NUMERIC(make_result(&const_one));
 		}
-		PG_RETURN_NUMERIC(make_result(&const_nan));
-	}
-	if (NUMERIC_IS_NAN(num2))
-	{
-		init_var_from_num(num1, &arg1);
-		if (cmp_var(&arg1, &const_one) == 0)
+
+		/*
+		 * For any value of x, if y is [-]0, 1.0 shall be returned.
+		 */
+		if (sign2 == 0)
 			PG_RETURN_NUMERIC(make_result(&const_one));
-		PG_RETURN_NUMERIC(make_result(&const_nan));
-	}
 
-	/*
-	 * Initialize things
-	 */
-	init_var(&arg2_trunc);
-	init_var(&result);
-	init_var_from_num(num1, &arg1);
-	init_var_from_num(num2, &arg2);
+		/*
+		 * For any odd integer value of y > 0, if x is [-]0, [-]0 shall be
+		 * returned.  For y > 0 and not an odd integer, if x is [-]0, +0 shall
+		 * be returned.  (Since we don't deal in minus zero, we need not
+		 * distinguish these two cases.)
+		 */
+		if (sign1 == 0 && sign2 > 0)
+			PG_RETURN_NUMERIC(make_result(&const_zero));
+
+		/*
+		 * If x is -1, and y is [-]Inf, 1.0 shall be returned.
+		 *
+		 * For |x| < 1, if y is -Inf, +Inf shall be returned.
+		 *
+		 * For |x| > 1, if y is -Inf, +0 shall be returned.
+		 *
+		 * For |x| < 1, if y is +Inf, +0 shall be returned.
+		 *
+		 * For |x| > 1, if y is +Inf, +Inf shall be returned.
+		 */
+		if (NUMERIC_IS_INF(num2))
+		{
+			bool		abs_x_gt_one;
+
+			if (NUMERIC_IS_SPECIAL(num1))
+				abs_x_gt_one = true;	/* x is either Inf or -Inf */
+			else
+			{
+				init_var_from_num(num1, &arg1);
+				if (cmp_var(&arg1, &const_minus_one) == 0)
+					PG_RETURN_NUMERIC(make_result(&const_one));
+				arg1.sign = NUMERIC_POS;	/* now arg1 = abs(x) */
+				abs_x_gt_one = (cmp_var(&arg1, &const_one) > 0);
+			}
+			if (abs_x_gt_one == (sign2 > 0))
+				PG_RETURN_NUMERIC(make_result(&const_pinf));
+			else
+				PG_RETURN_NUMERIC(make_result(&const_zero));
+		}
+
+		/*
+		 * For y < 0, if x is +Inf, +0 shall be returned.
+		 *
+		 * For y > 0, if x is +Inf, +Inf shall be returned.
+		 */
+		if (NUMERIC_IS_PINF(num1))
+		{
+			if (sign2 > 0)
+				PG_RETURN_NUMERIC(make_result(&const_pinf));
+			else
+				PG_RETURN_NUMERIC(make_result(&const_zero));
+		}
+
+		Assert(NUMERIC_IS_NINF(num1));
+
+		/*
+		 * For y an odd integer < 0, if x is -Inf, -0 shall be returned.  For
+		 * y < 0 and not an odd integer, if x is -Inf, +0 shall be returned.
+		 * (Again, we need not distinguish these two cases.)
+		 */
+		if (sign2 < 0)
+			PG_RETURN_NUMERIC(make_result(&const_zero));
 
-	set_var_from_var(&arg2, &arg2_trunc);
-	trunc_var(&arg2_trunc, 0);
+		/*
+		 * For y an odd integer > 0, if x is -Inf, -Inf shall be returned. For
+		 * y > 0 and not an odd integer, if x is -Inf, +Inf shall be returned.
+		 */
+		init_var_from_num(num2, &arg2);
+		if (arg2.ndigits > 0 && arg2.ndigits == arg2.weight + 1 &&
+			(arg2.digits[arg2.ndigits - 1] & 1))
+			PG_RETURN_NUMERIC(make_result(&const_ninf));
+		else
+			PG_RETURN_NUMERIC(make_result(&const_pinf));
+	}
 
 	/*
 	 * The SQL spec requires that we emit a particular SQLSTATE error code for
 	 * certain error conditions.  Specifically, we don't return a
 	 * divide-by-zero error code for 0 ^ -1.
 	 */
-	if (cmp_var(&arg1, &const_zero) == 0 &&
-		cmp_var(&arg2, &const_zero) < 0)
+	sign1 = numeric_sign_internal(num1);
+	sign2 = numeric_sign_internal(num2);
+
+	if (sign1 == 0 && sign2 < 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_ARGUMENT_FOR_POWER_FUNCTION),
 				 errmsg("zero raised to a negative power is undefined")));
 
-	if (cmp_var(&arg1, &const_zero) < 0 &&
-		cmp_var(&arg2, &arg2_trunc) != 0)
+	if (sign1 < 0 && !numeric_is_integral(num2))
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_ARGUMENT_FOR_POWER_FUNCTION),
 				 errmsg("a negative number raised to a non-integer power yields a complex result")));
 
+	/*
+	 * Initialize things
+	 */
+	init_var(&result);
+	init_var_from_num(num1, &arg1);
+	init_var_from_num(num2, &arg2);
+
 	/*
 	 * Call power_var() to compute and return the result; note it handles
 	 * scale selection itself.
@@ -3259,7 +3953,6 @@ numeric_power(PG_FUNCTION_ARGS)
 	res = make_result(&result);
 
 	free_var(&result);
-	free_var(&arg2_trunc);
 
 	PG_RETURN_NUMERIC(res);
 }
@@ -3274,7 +3967,7 @@ numeric_scale(PG_FUNCTION_ARGS)
 {
 	Numeric		num = PG_GETARG_NUMERIC(0);
 
-	if (NUMERIC_IS_NAN(num))
+	if (NUMERIC_IS_SPECIAL(num))
 		PG_RETURN_NULL();
 
 	PG_RETURN_INT32(NUMERIC_DSCALE(num));
@@ -3341,7 +4034,7 @@ numeric_min_scale(PG_FUNCTION_ARGS)
 	NumericVar	arg;
 	int			min_scale;
 
-	if (NUMERIC_IS_NAN(num))
+	if (NUMERIC_IS_SPECIAL(num))
 		PG_RETURN_NULL();
 
 	init_var_from_num(num, &arg);
@@ -3361,8 +4054,8 @@ numeric_trim_scale(PG_FUNCTION_ARGS)
 	Numeric		res;
 	NumericVar	result;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (NUMERIC_IS_SPECIAL(num))
+		PG_RETURN_NUMERIC(duplicate_numeric(num));
 
 	init_var_from_num(num, &result);
 	result.dscale = get_min_scale(&result);
@@ -3408,8 +4101,7 @@ numeric_int4_opt_error(Numeric num, bool *have_error)
 	if (have_error)
 		*have_error = false;
 
-	/* XXX would it be better to return NULL? */
-	if (NUMERIC_IS_NAN(num))
+	if (NUMERIC_IS_SPECIAL(num))
 	{
 		if (have_error)
 		{
@@ -3418,9 +4110,14 @@ numeric_int4_opt_error(Numeric num, bool *have_error)
 		}
 		else
 		{
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("cannot convert NaN to integer")));
+			if (NUMERIC_IS_NAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("cannot convert NaN to integer")));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("cannot convert infinity to integer")));
 		}
 	}
 
@@ -3499,11 +4196,17 @@ numeric_int8(PG_FUNCTION_ARGS)
 	NumericVar	x;
 	int64		result;
 
-	/* XXX would it be better to return NULL? */
-	if (NUMERIC_IS_NAN(num))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot convert NaN to bigint")));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_NAN(num))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert NaN to bigint")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert infinity to bigint")));
+	}
 
 	/* Convert to variable format and thence to int8 */
 	init_var_from_num(num, &x);
@@ -3544,11 +4247,17 @@ numeric_int2(PG_FUNCTION_ARGS)
 	int64		val;
 	int16		result;
 
-	/* XXX would it be better to return NULL? */
-	if (NUMERIC_IS_NAN(num))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot convert NaN to smallint")));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_NAN(num))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert NaN to smallint")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert infinity to smallint")));
+	}
 
 	/* Convert to variable format and thence to int8 */
 	init_var_from_num(num, &x);
@@ -3583,9 +4292,12 @@ float8_numeric(PG_FUNCTION_ARGS)
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
 	if (isinf(val))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot convert infinity to numeric")));
+	{
+		if (val < 0)
+			PG_RETURN_NUMERIC(make_result(&const_ninf));
+		else
+			PG_RETURN_NUMERIC(make_result(&const_pinf));
+	}
 
 	snprintf(buf, sizeof(buf), "%.*g", DBL_DIG, val);
 
@@ -3609,8 +4321,15 @@ numeric_float8(PG_FUNCTION_ARGS)
 	char	   *tmp;
 	Datum		result;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_FLOAT8(get_float8_nan());
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			PG_RETURN_FLOAT8(get_float8_infinity());
+		else if (NUMERIC_IS_NINF(num))
+			PG_RETURN_FLOAT8(-get_float8_infinity());
+		else
+			PG_RETURN_FLOAT8(get_float8_nan());
+	}
 
 	tmp = DatumGetCString(DirectFunctionCall1(numeric_out,
 											  NumericGetDatum(num)));
@@ -3634,10 +4353,22 @@ numeric_float8_no_overflow(PG_FUNCTION_ARGS)
 	Numeric		num = PG_GETARG_NUMERIC(0);
 	double		val;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_FLOAT8(get_float8_nan());
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			val = HUGE_VAL;
+		else if (NUMERIC_IS_NINF(num))
+			val = -HUGE_VAL;
+		else
+			val = get_float8_nan();
+	}
+	else
+	{
+		NumericVar	x;
 
-	val = numeric_to_double_no_overflow(num);
+		init_var_from_num(num, &x);
+		val = numericvar_to_double_no_overflow(&x);
+	}
 
 	PG_RETURN_FLOAT8(val);
 }
@@ -3654,9 +4385,12 @@ float4_numeric(PG_FUNCTION_ARGS)
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
 	if (isinf(val))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot convert infinity to numeric")));
+	{
+		if (val < 0)
+			PG_RETURN_NUMERIC(make_result(&const_ninf));
+		else
+			PG_RETURN_NUMERIC(make_result(&const_pinf));
+	}
 
 	snprintf(buf, sizeof(buf), "%.*g", FLT_DIG, val);
 
@@ -3680,8 +4414,15 @@ numeric_float4(PG_FUNCTION_ARGS)
 	char	   *tmp;
 	Datum		result;
 
-	if (NUMERIC_IS_NAN(num))
-		PG_RETURN_FLOAT4(get_float4_nan());
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_PINF(num))
+			PG_RETURN_FLOAT4(get_float4_infinity());
+		else if (NUMERIC_IS_NINF(num))
+			PG_RETURN_FLOAT4(-get_float4_infinity());
+		else
+			PG_RETURN_FLOAT4(get_float4_nan());
+	}
 
 	tmp = DatumGetCString(DirectFunctionCall1(numeric_out,
 											  NumericGetDatum(num)));
@@ -3701,10 +4442,17 @@ numeric_pg_lsn(PG_FUNCTION_ARGS)
 	NumericVar	x;
 	XLogRecPtr	result;
 
-	if (NUMERIC_IS_NAN(num))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot convert NaN to pg_lsn")));
+	if (NUMERIC_IS_SPECIAL(num))
+	{
+		if (NUMERIC_IS_NAN(num))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert NaN to pg_lsn")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot convert infinity to pg_lsn")));
+	}
 
 	/* Convert to variable format and thence to pg_lsn */
 	init_var_from_num(num, &x);
@@ -3741,9 +4489,15 @@ typedef struct NumericAggState
 	NumericSumAccum sumX2;		/* sum of squares of processed numbers */
 	int			maxScale;		/* maximum scale seen so far */
 	int64		maxScaleCount;	/* number of values seen with maximum scale */
-	int64		NaNcount;		/* count of NaN values (not included in N!) */
+	/* These counts are *not* included in N!  Use NA_TOTAL_COUNT() as needed */
+	int64		NaNcount;		/* count of NaN values */
+	int64		pInfcount;		/* count of +Inf values */
+	int64		nInfcount;		/* count of -Inf values */
 } NumericAggState;
 
+#define NA_TOTAL_COUNT(na) \
+	((na)->N + (na)->NaNcount + (na)->pInfcount + (na)->nInfcount)
+
 /*
  * Prepare state data for a numeric aggregate function that needs to compute
  * sum, count and optionally sum of squares of the input.
@@ -3795,10 +4549,15 @@ do_numeric_accum(NumericAggState *state, Numeric newval)
 	NumericVar	X2;
 	MemoryContext old_context;
 
-	/* Count NaN inputs separately from all else */
-	if (NUMERIC_IS_NAN(newval))
+	/* Count NaN/infinity inputs separately from all else */
+	if (NUMERIC_IS_SPECIAL(newval))
 	{
-		state->NaNcount++;
+		if (NUMERIC_IS_PINF(newval))
+			state->pInfcount++;
+		else if (NUMERIC_IS_NINF(newval))
+			state->nInfcount++;
+		else
+			state->NaNcount++;
 		return;
 	}
 
@@ -3860,10 +4619,15 @@ do_numeric_discard(NumericAggState *state, Numeric newval)
 	NumericVar	X2;
 	MemoryContext old_context;
 
-	/* Count NaN inputs separately from all else */
-	if (NUMERIC_IS_NAN(newval))
+	/* Count NaN/infinity inputs separately from all else */
+	if (NUMERIC_IS_SPECIAL(newval))
 	{
-		state->NaNcount--;
+		if (NUMERIC_IS_PINF(newval))
+			state->pInfcount--;
+		else if (NUMERIC_IS_NINF(newval))
+			state->nInfcount--;
+		else
+			state->NaNcount--;
 		return true;
 	}
 
@@ -3986,6 +4750,8 @@ numeric_combine(PG_FUNCTION_ARGS)
 		state1 = makeNumericAggStateCurrentContext(true);
 		state1->N = state2->N;
 		state1->NaNcount = state2->NaNcount;
+		state1->pInfcount = state2->pInfcount;
+		state1->nInfcount = state2->nInfcount;
 		state1->maxScale = state2->maxScale;
 		state1->maxScaleCount = state2->maxScaleCount;
 
@@ -3999,6 +4765,8 @@ numeric_combine(PG_FUNCTION_ARGS)
 
 	state1->N += state2->N;
 	state1->NaNcount += state2->NaNcount;
+	state1->pInfcount += state2->pInfcount;
+	state1->nInfcount += state2->nInfcount;
 
 	if (state2->N > 0)
 	{
@@ -4074,6 +4842,8 @@ numeric_avg_combine(PG_FUNCTION_ARGS)
 		state1 = makeNumericAggStateCurrentContext(false);
 		state1->N = state2->N;
 		state1->NaNcount = state2->NaNcount;
+		state1->pInfcount = state2->pInfcount;
+		state1->nInfcount = state2->nInfcount;
 		state1->maxScale = state2->maxScale;
 		state1->maxScaleCount = state2->maxScaleCount;
 
@@ -4086,6 +4856,8 @@ numeric_avg_combine(PG_FUNCTION_ARGS)
 
 	state1->N += state2->N;
 	state1->NaNcount += state2->NaNcount;
+	state1->pInfcount += state2->pInfcount;
+	state1->nInfcount += state2->nInfcount;
 
 	if (state2->N > 0)
 	{
@@ -4164,6 +4936,12 @@ numeric_avg_serialize(PG_FUNCTION_ARGS)
 	/* NaNcount */
 	pq_sendint64(&buf, state->NaNcount);
 
+	/* pInfcount */
+	pq_sendint64(&buf, state->pInfcount);
+
+	/* nInfcount */
+	pq_sendint64(&buf, state->nInfcount);
+
 	result = pq_endtypsend(&buf);
 
 	PG_RETURN_BYTEA_P(result);
@@ -4218,6 +4996,12 @@ numeric_avg_deserialize(PG_FUNCTION_ARGS)
 	/* NaNcount */
 	result->NaNcount = pq_getmsgint64(&buf);
 
+	/* pInfcount */
+	result->pInfcount = pq_getmsgint64(&buf);
+
+	/* nInfcount */
+	result->nInfcount = pq_getmsgint64(&buf);
+
 	pq_getmsgend(&buf);
 	pfree(buf.data);
 
@@ -4286,6 +5070,12 @@ numeric_serialize(PG_FUNCTION_ARGS)
 	/* NaNcount */
 	pq_sendint64(&buf, state->NaNcount);
 
+	/* pInfcount */
+	pq_sendint64(&buf, state->pInfcount);
+
+	/* nInfcount */
+	pq_sendint64(&buf, state->nInfcount);
+
 	result = pq_endtypsend(&buf);
 
 	PG_RETURN_BYTEA_P(result);
@@ -4349,6 +5139,12 @@ numeric_deserialize(PG_FUNCTION_ARGS)
 	/* NaNcount */
 	result->NaNcount = pq_getmsgint64(&buf);
 
+	/* pInfcount */
+	result->pInfcount = pq_getmsgint64(&buf);
+
+	/* nInfcount */
+	result->nInfcount = pq_getmsgint64(&buf);
+
 	pq_getmsgend(&buf);
 	pfree(buf.data);
 
@@ -5141,12 +5937,20 @@ numeric_avg(PG_FUNCTION_ARGS)
 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
-	if (state == NULL || (state->N + state->NaNcount) == 0)
+	if (state == NULL || NA_TOTAL_COUNT(state) == 0)
 		PG_RETURN_NULL();
 
 	if (state->NaNcount > 0)	/* there was at least one NaN input */
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
+	/* adding plus and minus infinities gives NaN */
+	if (state->pInfcount > 0 && state->nInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (state->pInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_pinf));
+	if (state->nInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_ninf));
+
 	N_datum = DirectFunctionCall1(int8_numeric, Int64GetDatum(state->N));
 
 	init_var(&sumX_var);
@@ -5167,12 +5971,20 @@ numeric_sum(PG_FUNCTION_ARGS)
 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
-	if (state == NULL || (state->N + state->NaNcount) == 0)
+	if (state == NULL || NA_TOTAL_COUNT(state) == 0)
 		PG_RETURN_NULL();
 
 	if (state->NaNcount > 0)	/* there was at least one NaN input */
 		PG_RETURN_NUMERIC(make_result(&const_nan));
 
+	/* adding plus and minus infinities gives NaN */
+	if (state->pInfcount > 0 && state->nInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_nan));
+	if (state->pInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_pinf));
+	if (state->nInfcount > 0)
+		PG_RETURN_NUMERIC(make_result(&const_ninf));
+
 	init_var(&sumX_var);
 	accum_sum_final(&state->sumX, &sumX_var);
 	result = make_result(&sumX_var);
@@ -5208,9 +6020,9 @@ numeric_stddev_internal(NumericAggState *state,
 	/*
 	 * Sample stddev and variance are undefined when N <= 1; population stddev
 	 * is undefined when N == 0.  Return NULL in either case (note that NaNs
-	 * count as normal inputs for this purpose).
+	 * and infinities count as normal inputs for this purpose).
 	 */
-	if (state == NULL || (totCount = state->N + state->NaNcount) == 0)
+	if (state == NULL || (totCount = NA_TOTAL_COUNT(state)) == 0)
 	{
 		*is_null = true;
 		return NULL;
@@ -5225,9 +6037,10 @@ numeric_stddev_internal(NumericAggState *state,
 	*is_null = false;
 
 	/*
-	 * Deal with NaN inputs.
+	 * Deal with NaN and infinity cases.  By analogy to the behavior of the
+	 * float8 functions, any infinity input produces NaN output.
 	 */
-	if (state->NaNcount > 0)
+	if (state->NaNcount > 0 || state->pInfcount > 0 || state->nInfcount > 0)
 		return make_result(&const_nan);
 
 	/* OK, normal calculation applies */
@@ -5870,6 +6683,12 @@ dump_numeric(const char *str, Numeric num)
 		case NUMERIC_NAN:
 			printf("NaN");
 			break;
+		case NUMERIC_PINF:
+			printf("Infinity");
+			break;
+		case NUMERIC_NINF:
+			printf("-Infinity");
+			break;
 		default:
 			printf("SIGN=0x%x", NUMERIC_SIGN(num));
 			break;
@@ -5901,6 +6720,12 @@ dump_var(const char *str, NumericVar *var)
 		case NUMERIC_NAN:
 			printf("NaN");
 			break;
+		case NUMERIC_PINF:
+			printf("Infinity");
+			break;
+		case NUMERIC_NINF:
+			printf("-Infinity");
+			break;
 		default:
 			printf("SIGN=0x%x", var->sign);
 			break;
@@ -5918,8 +6743,9 @@ dump_var(const char *str, NumericVar *var)
  *
  * Local functions follow
  *
- * In general, these do not support NaNs --- callers must eliminate
- * the possibility of NaN first.  (make_result() is an exception.)
+ * In general, these do not support "special" (NaN or infinity) inputs;
+ * callers should handle those possibilities first.
+ * (There are one or two exceptions, noted in their header comments.)
  *
  * ----------------------------------------------------------------------
  */
@@ -5979,9 +6805,9 @@ zero_var(NumericVar *var)
  *
  *	Parse a string and put the number into a variable
  *
- * This function does not handle leading or trailing spaces, and it doesn't
- * accept "NaN" either.  It returns the end+1 position so that caller can
- * check for trailing spaces/garbage if deemed necessary.
+ * This function does not handle leading or trailing spaces.  It returns
+ * the end+1 position parsed, so that caller can check for trailing
+ * spaces/garbage if deemed necessary.
  *
  * cp is the place to actually start parsing; str is what to use in error
  * reports.  (Typically cp would be the same except advanced over spaces.)
@@ -6455,13 +7281,29 @@ get_str_from_var_sci(const NumericVar *var, int rscale)
 }
 
 
+/*
+ * duplicate_numeric() - copy a packed-format Numeric
+ *
+ * This will handle NaN and Infinity cases.
+ */
+static Numeric
+duplicate_numeric(Numeric num)
+{
+	Numeric		res;
+
+	res = (Numeric) palloc(VARSIZE(num));
+	memcpy(res, num, VARSIZE(num));
+	return res;
+}
+
 /*
  * make_result_opt_error() -
  *
  *	Create the packed db numeric format in palloc()'d memory from
- *	a variable.  If "*have_error" flag is provided, on error it's set to
- *	true, NULL returned.  This is helpful when caller need to handle errors
- *	by itself.
+ *	a variable.  This will handle NaN and Infinity cases.
+ *
+ *	If "have_error" isn't NULL, on overflow *have_error is set to true and
+ *	NULL is returned.  This is helpful when caller needs to handle errors.
  */
 static Numeric
 make_result_opt_error(const NumericVar *var, bool *have_error)
@@ -6476,12 +7318,22 @@ make_result_opt_error(const NumericVar *var, bool *have_error)
 	if (have_error)
 		*have_error = false;
 
-	if (sign == NUMERIC_NAN)
+	if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_SPECIAL)
 	{
+		/*
+		 * Verify valid special value.  This could be just an Assert, perhaps,
+		 * but it seems worthwhile to expend a few cycles to ensure that we
+		 * never write any nonzero reserved bits to disk.
+		 */
+		if (!(sign == NUMERIC_NAN ||
+			  sign == NUMERIC_PINF ||
+			  sign == NUMERIC_NINF))
+			elog(ERROR, "invalid numeric sign value 0x%x", sign);
+
 		result = (Numeric) palloc(NUMERIC_HDRSZ_SHORT);
 
 		SET_VARSIZE(result, NUMERIC_HDRSZ_SHORT);
-		result->choice.n_header = NUMERIC_NAN;
+		result->choice.n_header = sign;
 		/* the header word is all we need */
 
 		dump_numeric("make_result()", result);
@@ -6572,8 +7424,8 @@ make_result(const NumericVar *var)
 /*
  * apply_typmod() -
  *
- *	Do bounds checking and rounding according to the attributes
- *	typmod field.
+ *	Do bounds checking and rounding according to the specified typmod.
+ *	Note that this is only applied to normal finite values.
  */
 static void
 apply_typmod(NumericVar *var, int32 typmod)
@@ -6646,6 +7498,45 @@ apply_typmod(NumericVar *var, int32 typmod)
 	}
 }
 
+/*
+ * apply_typmod_special() -
+ *
+ *	Do bounds checking according to the specified typmod, for an Inf or NaN.
+ *	For convenience of most callers, the value is presented in packed form.
+ */
+static void
+apply_typmod_special(Numeric num, int32 typmod)
+{
+	int			precision;
+	int			scale;
+
+	Assert(NUMERIC_IS_SPECIAL(num));	/* caller error if not */
+
+	/*
+	 * NaN is allowed regardless of the typmod; that's rather dubious perhaps,
+	 * but it's a longstanding behavior.  Inf is rejected if we have any
+	 * typmod restriction, since an infinity shouldn't be claimed to fit in
+	 * any finite number of digits.
+	 */
+	if (NUMERIC_IS_NAN(num))
+		return;
+
+	/* Do nothing if we have a default typmod (-1) */
+	if (typmod < (int32) (VARHDRSZ))
+		return;
+
+	typmod -= VARHDRSZ;
+	precision = (typmod >> 16) & 0xffff;
+	scale = typmod & 0xffff;
+
+	ereport(ERROR,
+			(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+			 errmsg("numeric field overflow"),
+			 errdetail("A field with precision %d, scale %d cannot hold an infinite value.",
+					   precision, scale)));
+}
+
+
 /*
  * Convert numeric to int8, rounding if needed.
  *
@@ -6961,36 +7852,9 @@ int128_to_numericvar(int128 val, NumericVar *var)
 #endif
 
 /*
- * Convert numeric to float8; if out of range, return +/- HUGE_VAL
+ * Convert a NumericVar to float8; if out of range, return +/- HUGE_VAL
  */
 static double
-numeric_to_double_no_overflow(Numeric num)
-{
-	char	   *tmp;
-	double		val;
-	char	   *endptr;
-
-	tmp = DatumGetCString(DirectFunctionCall1(numeric_out,
-											  NumericGetDatum(num)));
-
-	/* unlike float8in, we ignore ERANGE from strtod */
-	val = strtod(tmp, &endptr);
-	if (*endptr != '\0')
-	{
-		/* shouldn't happen ... */
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-				 errmsg("invalid input syntax for type %s: \"%s\"",
-						"double precision", tmp)));
-	}
-
-	pfree(tmp);
-
-	return val;
-}
-
-/* As above, but work from a NumericVar */
-static double
 numericvar_to_double_no_overflow(const NumericVar *var)
 {
 	char	   *tmp;
diff --git a/src/include/utils/numeric.h b/src/include/utils/numeric.h
index 0604cb65ed469..0b7d4ba3c4bc0 100644
--- a/src/include/utils/numeric.h
+++ b/src/include/utils/numeric.h
@@ -57,6 +57,7 @@ typedef struct NumericData *Numeric;
  * Utility functions in numeric.c
  */
 extern bool numeric_is_nan(Numeric num);
+extern bool numeric_is_inf(Numeric num);
 int32		numeric_maximum_size(int32 typmod);
 extern char *numeric_out_sci(Numeric num, int scale);
 extern char *numeric_normalize(Numeric num);
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 3bd184ae294b2..477fd1205c30b 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -211,6 +211,18 @@ SELECT stddev_pop(3.0::numeric), stddev_samp(4.0::numeric);
           0 |            
 (1 row)
 
+SELECT var_pop('inf'::numeric), var_samp('inf'::numeric);
+ var_pop | var_samp 
+---------+----------
+     NaN |         
+(1 row)
+
+SELECT stddev_pop('inf'::numeric), stddev_samp('inf'::numeric);
+ stddev_pop | stddev_samp 
+------------+-------------
+        NaN |            
+(1 row)
+
 SELECT var_pop('nan'::numeric), var_samp('nan'::numeric);
  var_pop | var_samp 
 ---------+----------
@@ -285,32 +297,74 @@ select avg('NaN'::numeric) from generate_series(1,3);
 (1 row)
 
 -- verify correct results for infinite inputs
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('1'), ('infinity')) v(x);
-   avg    | var_pop 
-----------+---------
- Infinity |     NaN
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
 (1 row)
 
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('infinity'), ('1')) v(x);
-   avg    | var_pop 
-----------+---------
- Infinity |     NaN
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
 (1 row)
 
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('infinity'), ('infinity')) v(x);
-   avg    | var_pop 
-----------+---------
- Infinity |     NaN
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
 (1 row)
 
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
+FROM (VALUES ('-infinity'), ('infinity')) v(x);
+ sum | avg | var_pop 
+-----+-----+---------
+ NaN | NaN |     NaN
+(1 row)
+
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
+FROM (VALUES ('-infinity'), ('-infinity')) v(x);
+    sum    |    avg    | var_pop 
+-----------+-----------+---------
+ -Infinity | -Infinity |     NaN
+(1 row)
+
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('1'), ('infinity')) v(x);
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
+(1 row)
+
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('infinity'), ('1')) v(x);
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
+(1 row)
+
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('infinity'), ('infinity')) v(x);
+   sum    |   avg    | var_pop 
+----------+----------+---------
+ Infinity | Infinity |     NaN
+(1 row)
+
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
 FROM (VALUES ('-infinity'), ('infinity')) v(x);
- avg | var_pop 
------+---------
- NaN |     NaN
+ sum | avg | var_pop 
+-----+-----+---------
+ NaN | NaN |     NaN
+(1 row)
+
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('-infinity'), ('-infinity')) v(x);
+    sum    |    avg    | var_pop 
+-----------+-----------+---------
+ -Infinity | -Infinity |     NaN
 (1 row)
 
 -- test accuracy with a large input offset
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 81a0c5d40f714..8546ce901fa78 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -660,6 +660,432 @@ SELECT t1.id1, t1.result, t2.expected
 -----+--------+----------
 (0 rows)
 
+-- ******************************
+-- * Check behavior with Inf and NaN inputs.  It's easiest to handle these
+-- * separately from the num_data framework used above, because some input
+-- * combinations will throw errors.
+-- ******************************
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('inf'),('-inf'),('nan'))
+SELECT x1, x2,
+  x1 + x2 AS sum,
+  x1 - x2 AS diff,
+  x1 * x2 AS prod
+FROM v AS v1(x1), v AS v2(x2);
+    x1     |    x2     |    sum    |   diff    |   prod    
+-----------+-----------+-----------+-----------+-----------
+         0 |         0 |         0 |         0 |         0
+         0 |         1 |         1 |        -1 |         0
+         0 |        -1 |        -1 |         1 |         0
+         0 |       4.2 |       4.2 |      -4.2 |       0.0
+         0 |  Infinity |  Infinity | -Infinity |       NaN
+         0 | -Infinity | -Infinity |  Infinity |       NaN
+         0 |       NaN |       NaN |       NaN |       NaN
+         1 |         0 |         1 |         1 |         0
+         1 |         1 |         2 |         0 |         1
+         1 |        -1 |         0 |         2 |        -1
+         1 |       4.2 |       5.2 |      -3.2 |       4.2
+         1 |  Infinity |  Infinity | -Infinity |  Infinity
+         1 | -Infinity | -Infinity |  Infinity | -Infinity
+         1 |       NaN |       NaN |       NaN |       NaN
+        -1 |         0 |        -1 |        -1 |         0
+        -1 |         1 |         0 |        -2 |        -1
+        -1 |        -1 |        -2 |         0 |         1
+        -1 |       4.2 |       3.2 |      -5.2 |      -4.2
+        -1 |  Infinity |  Infinity | -Infinity | -Infinity
+        -1 | -Infinity | -Infinity |  Infinity |  Infinity
+        -1 |       NaN |       NaN |       NaN |       NaN
+       4.2 |         0 |       4.2 |       4.2 |       0.0
+       4.2 |         1 |       5.2 |       3.2 |       4.2
+       4.2 |        -1 |       3.2 |       5.2 |      -4.2
+       4.2 |       4.2 |       8.4 |       0.0 |     17.64
+       4.2 |  Infinity |  Infinity | -Infinity |  Infinity
+       4.2 | -Infinity | -Infinity |  Infinity | -Infinity
+       4.2 |       NaN |       NaN |       NaN |       NaN
+  Infinity |         0 |  Infinity |  Infinity |       NaN
+  Infinity |         1 |  Infinity |  Infinity |  Infinity
+  Infinity |        -1 |  Infinity |  Infinity | -Infinity
+  Infinity |       4.2 |  Infinity |  Infinity |  Infinity
+  Infinity |  Infinity |  Infinity |       NaN |  Infinity
+  Infinity | -Infinity |       NaN |  Infinity | -Infinity
+  Infinity |       NaN |       NaN |       NaN |       NaN
+ -Infinity |         0 | -Infinity | -Infinity |       NaN
+ -Infinity |         1 | -Infinity | -Infinity | -Infinity
+ -Infinity |        -1 | -Infinity | -Infinity |  Infinity
+ -Infinity |       4.2 | -Infinity | -Infinity | -Infinity
+ -Infinity |  Infinity |       NaN | -Infinity | -Infinity
+ -Infinity | -Infinity | -Infinity |       NaN |  Infinity
+ -Infinity |       NaN |       NaN |       NaN |       NaN
+       NaN |         0 |       NaN |       NaN |       NaN
+       NaN |         1 |       NaN |       NaN |       NaN
+       NaN |        -1 |       NaN |       NaN |       NaN
+       NaN |       4.2 |       NaN |       NaN |       NaN
+       NaN |  Infinity |       NaN |       NaN |       NaN
+       NaN | -Infinity |       NaN |       NaN |       NaN
+       NaN |       NaN |       NaN |       NaN |       NaN
+(49 rows)
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('inf'),('-inf'),('nan'))
+SELECT x1, x2,
+  x1 / x2 AS quot,
+  x1 % x2 AS mod,
+  div(x1, x2) AS div
+FROM v AS v1(x1), v AS v2(x2) WHERE x2 != 0;
+    x1     |    x2     |          quot           | mod  |    div    
+-----------+-----------+-------------------------+------+-----------
+         0 |         1 |  0.00000000000000000000 |    0 |         0
+         1 |         1 |  1.00000000000000000000 |    0 |         1
+        -1 |         1 | -1.00000000000000000000 |    0 |        -1
+       4.2 |         1 |      4.2000000000000000 |  0.2 |         4
+  Infinity |         1 |                Infinity |  NaN |  Infinity
+ -Infinity |         1 |               -Infinity |  NaN | -Infinity
+       NaN |         1 |                     NaN |  NaN |       NaN
+         0 |        -1 |  0.00000000000000000000 |    0 |         0
+         1 |        -1 | -1.00000000000000000000 |    0 |        -1
+        -1 |        -1 |  1.00000000000000000000 |    0 |         1
+       4.2 |        -1 |     -4.2000000000000000 |  0.2 |        -4
+  Infinity |        -1 |               -Infinity |  NaN | -Infinity
+ -Infinity |        -1 |                Infinity |  NaN |  Infinity
+       NaN |        -1 |                     NaN |  NaN |       NaN
+         0 |       4.2 |  0.00000000000000000000 |  0.0 |         0
+         1 |       4.2 |  0.23809523809523809524 |  1.0 |         0
+        -1 |       4.2 | -0.23809523809523809524 | -1.0 |         0
+       4.2 |       4.2 |  1.00000000000000000000 |  0.0 |         1
+  Infinity |       4.2 |                Infinity |  NaN |  Infinity
+ -Infinity |       4.2 |               -Infinity |  NaN | -Infinity
+       NaN |       4.2 |                     NaN |  NaN |       NaN
+         0 |  Infinity |                       0 |    0 |         0
+         1 |  Infinity |                       0 |    1 |         0
+        -1 |  Infinity |                       0 |   -1 |         0
+       4.2 |  Infinity |                       0 |  4.2 |         0
+  Infinity |  Infinity |                     NaN |  NaN |       NaN
+ -Infinity |  Infinity |                     NaN |  NaN |       NaN
+       NaN |  Infinity |                     NaN |  NaN |       NaN
+         0 | -Infinity |                       0 |    0 |         0
+         1 | -Infinity |                       0 |    1 |         0
+        -1 | -Infinity |                       0 |   -1 |         0
+       4.2 | -Infinity |                       0 |  4.2 |         0
+  Infinity | -Infinity |                     NaN |  NaN |       NaN
+ -Infinity | -Infinity |                     NaN |  NaN |       NaN
+       NaN | -Infinity |                     NaN |  NaN |       NaN
+         0 |       NaN |                     NaN |  NaN |       NaN
+         1 |       NaN |                     NaN |  NaN |       NaN
+        -1 |       NaN |                     NaN |  NaN |       NaN
+       4.2 |       NaN |                     NaN |  NaN |       NaN
+  Infinity |       NaN |                     NaN |  NaN |       NaN
+ -Infinity |       NaN |                     NaN |  NaN |       NaN
+       NaN |       NaN |                     NaN |  NaN |       NaN
+(42 rows)
+
+SELECT 'inf'::numeric / '0';
+ERROR:  division by zero
+SELECT '-inf'::numeric / '0';
+ERROR:  division by zero
+SELECT 'nan'::numeric / '0';
+ ?column? 
+----------
+      NaN
+(1 row)
+
+SELECT '0'::numeric / '0';
+ERROR:  division by zero
+SELECT 'inf'::numeric % '0';
+ERROR:  division by zero
+SELECT '-inf'::numeric % '0';
+ERROR:  division by zero
+SELECT 'nan'::numeric % '0';
+ ?column? 
+----------
+      NaN
+(1 row)
+
+SELECT '0'::numeric % '0';
+ERROR:  division by zero
+SELECT div('inf'::numeric, '0');
+ERROR:  division by zero
+SELECT div('-inf'::numeric, '0');
+ERROR:  division by zero
+SELECT div('nan'::numeric, '0');
+ div 
+-----
+ NaN
+(1 row)
+
+SELECT div('0'::numeric, '0');
+ERROR:  division by zero
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('inf'),('-inf'),('nan'))
+SELECT x, -x as minusx, abs(x), floor(x), ceil(x), sign(x), numeric_inc(x) as inc
+FROM v;
+     x     |  minusx   |   abs    |   floor   |   ceil    | sign |    inc    
+-----------+-----------+----------+-----------+-----------+------+-----------
+         0 |         0 |        0 |         0 |         0 |    0 |         1
+         1 |        -1 |        1 |         1 |         1 |    1 |         2
+        -1 |         1 |        1 |        -1 |        -1 |   -1 |         0
+       4.2 |      -4.2 |      4.2 |         4 |         5 |    1 |       5.2
+    -7.777 |     7.777 |    7.777 |        -8 |        -7 |   -1 |    -6.777
+  Infinity | -Infinity | Infinity |  Infinity |  Infinity |    1 |  Infinity
+ -Infinity |  Infinity | Infinity | -Infinity | -Infinity |   -1 | -Infinity
+       NaN |       NaN |      NaN |       NaN |       NaN |  NaN |       NaN
+(8 rows)
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('inf'),('-inf'),('nan'))
+SELECT x, round(x), round(x,1) as round1, trunc(x), trunc(x,1) as trunc1
+FROM v;
+     x     |   round   |  round1   |   trunc   |  trunc1   
+-----------+-----------+-----------+-----------+-----------
+         0 |         0 |       0.0 |         0 |       0.0
+         1 |         1 |       1.0 |         1 |       1.0
+        -1 |        -1 |      -1.0 |        -1 |      -1.0
+       4.2 |         4 |       4.2 |         4 |       4.2
+    -7.777 |        -8 |      -7.8 |        -7 |      -7.7
+  Infinity |  Infinity |  Infinity |  Infinity |  Infinity
+ -Infinity | -Infinity | -Infinity | -Infinity | -Infinity
+       NaN |       NaN |       NaN |       NaN |       NaN
+(8 rows)
+
+-- the large values fall into the numeric abbreviation code's maximal classes
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('1e340'),('-1e340'),
+         ('inf'),('-inf'),('nan'),
+         ('inf'),('-inf'),('nan'))
+SELECT substring(x::text, 1, 32)
+FROM v ORDER BY x;
+            substring             
+----------------------------------
+ -Infinity
+ -Infinity
+ -1000000000000000000000000000000
+ -7.777
+ -1
+ 0
+ 1
+ 4.2
+ 10000000000000000000000000000000
+ Infinity
+ Infinity
+ NaN
+ NaN
+(13 rows)
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('4.2'),('inf'),('nan'))
+SELECT x, sqrt(x)
+FROM v;
+    x     |       sqrt        
+----------+-------------------
+        0 | 0.000000000000000
+        1 | 1.000000000000000
+      4.2 | 2.049390153191920
+ Infinity |          Infinity
+      NaN |               NaN
+(5 rows)
+
+SELECT sqrt('-1'::numeric);
+ERROR:  cannot take square root of a negative number
+SELECT sqrt('-inf'::numeric);
+ERROR:  cannot take square root of a negative number
+WITH v(x) AS
+  (VALUES('1'::numeric),('4.2'),('inf'),('nan'))
+SELECT x,
+  log(x),
+  log10(x),
+  ln(x)
+FROM v;
+    x     |        log         |       log10        |         ln         
+----------+--------------------+--------------------+--------------------
+        1 | 0.0000000000000000 | 0.0000000000000000 | 0.0000000000000000
+      4.2 | 0.6232492903979005 | 0.6232492903979005 | 1.4350845252893226
+ Infinity |           Infinity |           Infinity |           Infinity
+      NaN |                NaN |                NaN |                NaN
+(4 rows)
+
+SELECT ln('0'::numeric);
+ERROR:  cannot take logarithm of zero
+SELECT ln('-1'::numeric);
+ERROR:  cannot take logarithm of a negative number
+SELECT ln('-inf'::numeric);
+ERROR:  cannot take logarithm of a negative number
+WITH v(x) AS
+  (VALUES('2'::numeric),('4.2'),('inf'),('nan'))
+SELECT x1, x2,
+  log(x1, x2)
+FROM v AS v1(x1), v AS v2(x2);
+    x1    |    x2    |        log         
+----------+----------+--------------------
+        2 |        2 | 1.0000000000000000
+        2 |      4.2 | 2.0703893278913979
+        2 | Infinity |           Infinity
+        2 |      NaN |                NaN
+      4.2 |        2 | 0.4830009440873890
+      4.2 |      4.2 | 1.0000000000000000
+      4.2 | Infinity |           Infinity
+      4.2 |      NaN |                NaN
+ Infinity |        2 |                  0
+ Infinity |      4.2 |                  0
+ Infinity | Infinity |                NaN
+ Infinity |      NaN |                NaN
+      NaN |        2 |                NaN
+      NaN |      4.2 |                NaN
+      NaN | Infinity |                NaN
+      NaN |      NaN |                NaN
+(16 rows)
+
+SELECT log('0'::numeric, '10');
+ERROR:  cannot take logarithm of zero
+SELECT log('10'::numeric, '0');
+ERROR:  cannot take logarithm of zero
+SELECT log('-inf'::numeric, '10');
+ERROR:  cannot take logarithm of a negative number
+SELECT log('10'::numeric, '-inf');
+ERROR:  cannot take logarithm of a negative number
+SELECT log('inf'::numeric, '0');
+ERROR:  cannot take logarithm of zero
+SELECT log('inf'::numeric, '-inf');
+ERROR:  cannot take logarithm of a negative number
+SELECT log('-inf'::numeric, 'inf');
+ERROR:  cannot take logarithm of a negative number
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('2'),('4.2'),('inf'),('nan'))
+SELECT x1, x2,
+  power(x1, x2)
+FROM v AS v1(x1), v AS v2(x2) WHERE x1 != 0 OR x2 >= 0;
+    x1    |    x2    |        power        
+----------+----------+---------------------
+        0 |        0 |  1.0000000000000000
+        0 |        1 |  0.0000000000000000
+        0 |        2 |  0.0000000000000000
+        0 |      4.2 |  0.0000000000000000
+        0 | Infinity |                   0
+        0 |      NaN |                 NaN
+        1 |        0 |  1.0000000000000000
+        1 |        1 |  1.0000000000000000
+        1 |        2 |  1.0000000000000000
+        1 |      4.2 |  1.0000000000000000
+        1 | Infinity |                   1
+        1 |      NaN |                   1
+        2 |        0 |  1.0000000000000000
+        2 |        1 |  2.0000000000000000
+        2 |        2 |  4.0000000000000000
+        2 |      4.2 |  18.379173679952560
+        2 | Infinity |            Infinity
+        2 |      NaN |                 NaN
+      4.2 |        0 |  1.0000000000000000
+      4.2 |        1 |  4.2000000000000000
+      4.2 |        2 | 17.6400000000000000
+      4.2 |      4.2 |  414.61691860129675
+      4.2 | Infinity |            Infinity
+      4.2 |      NaN |                 NaN
+ Infinity |        0 |                   1
+ Infinity |        1 |            Infinity
+ Infinity |        2 |            Infinity
+ Infinity |      4.2 |            Infinity
+ Infinity | Infinity |            Infinity
+ Infinity |      NaN |                 NaN
+      NaN |        0 |                   1
+      NaN |        1 |                 NaN
+      NaN |        2 |                 NaN
+      NaN |      4.2 |                 NaN
+      NaN | Infinity |                 NaN
+      NaN |      NaN |                 NaN
+(36 rows)
+
+SELECT power('0'::numeric, '-1');
+ERROR:  zero raised to a negative power is undefined
+SELECT power('0'::numeric, '-inf');
+ERROR:  zero raised to a negative power is undefined
+SELECT power('-1'::numeric, 'inf');
+ power 
+-------
+     1
+(1 row)
+
+SELECT power('-2'::numeric, '3');
+        power        
+---------------------
+ -8.0000000000000000
+(1 row)
+
+SELECT power('-2'::numeric, '3.3');
+ERROR:  a negative number raised to a non-integer power yields a complex result
+SELECT power('-2'::numeric, '-1');
+        power        
+---------------------
+ -0.5000000000000000
+(1 row)
+
+SELECT power('-2'::numeric, '-1.5');
+ERROR:  a negative number raised to a non-integer power yields a complex result
+SELECT power('-2'::numeric, 'inf');
+  power   
+----------
+ Infinity
+(1 row)
+
+SELECT power('-2'::numeric, '-inf');
+ power 
+-------
+     0
+(1 row)
+
+SELECT power('inf'::numeric, '-2');
+ power 
+-------
+     0
+(1 row)
+
+SELECT power('inf'::numeric, '-inf');
+ power 
+-------
+     0
+(1 row)
+
+SELECT power('-inf'::numeric, '2');
+  power   
+----------
+ Infinity
+(1 row)
+
+SELECT power('-inf'::numeric, '3');
+   power   
+-----------
+ -Infinity
+(1 row)
+
+SELECT power('-inf'::numeric, '4.5');
+ERROR:  a negative number raised to a non-integer power yields a complex result
+SELECT power('-inf'::numeric, '-2');
+ power 
+-------
+     0
+(1 row)
+
+SELECT power('-inf'::numeric, '-3');
+ power 
+-------
+     0
+(1 row)
+
+SELECT power('-inf'::numeric, '0');
+ power 
+-------
+     1
+(1 row)
+
+SELECT power('-inf'::numeric, 'inf');
+  power   
+----------
+ Infinity
+(1 row)
+
+SELECT power('-inf'::numeric, '-inf');
+ power 
+-------
+     0
+(1 row)
+
 -- ******************************
 -- * miscellaneous checks for things that have been broken in the past...
 -- ******************************
@@ -696,6 +1122,13 @@ ERROR:  numeric field overflow
 DETAIL:  A field with precision 4, scale 4 must round to an absolute value less than 1.
 INSERT INTO fract_only VALUES (7, '0.00001');
 INSERT INTO fract_only VALUES (8, '0.00017');
+INSERT INTO fract_only VALUES (9, 'NaN');
+INSERT INTO fract_only VALUES (10, 'Inf');	-- should fail
+ERROR:  numeric field overflow
+DETAIL:  A field with precision 4, scale 4 cannot hold an infinite value.
+INSERT INTO fract_only VALUES (11, '-Inf');	-- should fail
+ERROR:  numeric field overflow
+DETAIL:  A field with precision 4, scale 4 cannot hold an infinite value.
 SELECT * FROM fract_only;
  id |   val   
 ----+---------
@@ -705,7 +1138,8 @@ SELECT * FROM fract_only;
   5 |  0.9999
   7 |  0.0000
   8 |  0.0002
-(6 rows)
+  9 |     NaN
+(7 rows)
 
 DROP TABLE fract_only;
 -- Check inf/nan conversion behavior
@@ -716,9 +1150,35 @@ SELECT 'NaN'::float8::numeric;
 (1 row)
 
 SELECT 'Infinity'::float8::numeric;
-ERROR:  cannot convert infinity to numeric
+ numeric  
+----------
+ Infinity
+(1 row)
+
 SELECT '-Infinity'::float8::numeric;
-ERROR:  cannot convert infinity to numeric
+  numeric  
+-----------
+ -Infinity
+(1 row)
+
+SELECT 'NaN'::numeric::float8;
+ float8 
+--------
+    NaN
+(1 row)
+
+SELECT 'Infinity'::numeric::float8;
+  float8  
+----------
+ Infinity
+(1 row)
+
+SELECT '-Infinity'::numeric::float8;
+  float8   
+-----------
+ -Infinity
+(1 row)
+
 SELECT 'NaN'::float4::numeric;
  numeric 
 ---------
@@ -726,9 +1186,59 @@ SELECT 'NaN'::float4::numeric;
 (1 row)
 
 SELECT 'Infinity'::float4::numeric;
-ERROR:  cannot convert infinity to numeric
+ numeric  
+----------
+ Infinity
+(1 row)
+
 SELECT '-Infinity'::float4::numeric;
-ERROR:  cannot convert infinity to numeric
+  numeric  
+-----------
+ -Infinity
+(1 row)
+
+SELECT 'NaN'::numeric::float4;
+ float4 
+--------
+    NaN
+(1 row)
+
+SELECT 'Infinity'::numeric::float4;
+  float4  
+----------
+ Infinity
+(1 row)
+
+SELECT '-Infinity'::numeric::float4;
+  float4   
+-----------
+ -Infinity
+(1 row)
+
+SELECT '42'::int2::numeric;
+ numeric 
+---------
+      42
+(1 row)
+
+SELECT 'NaN'::numeric::int2;
+ERROR:  cannot convert NaN to smallint
+SELECT 'Infinity'::numeric::int2;
+ERROR:  cannot convert infinity to smallint
+SELECT '-Infinity'::numeric::int2;
+ERROR:  cannot convert infinity to smallint
+SELECT 'NaN'::numeric::int4;
+ERROR:  cannot convert NaN to integer
+SELECT 'Infinity'::numeric::int4;
+ERROR:  cannot convert infinity to integer
+SELECT '-Infinity'::numeric::int4;
+ERROR:  cannot convert infinity to integer
+SELECT 'NaN'::numeric::int8;
+ERROR:  cannot convert NaN to bigint
+SELECT 'Infinity'::numeric::int8;
+ERROR:  cannot convert infinity to bigint
+SELECT '-Infinity'::numeric::int8;
+ERROR:  cannot convert infinity to bigint
 -- Simple check that ceil(), floor(), and round() work correctly
 CREATE TABLE ceil_floor_round (a numeric);
 INSERT INTO ceil_floor_round VALUES ('-5.5');
@@ -794,6 +1304,12 @@ SELECT width_bucket('NaN', 3.0, 4.0, 888);
 ERROR:  operand, lower bound, and upper bound cannot be NaN
 SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888);
 ERROR:  operand, lower bound, and upper bound cannot be NaN
+SELECT width_bucket('inf', 3.0, 4.0, 888);
+ERROR:  operand, lower bound, and upper bound cannot be infinity
+SELECT width_bucket(2.0, 3.0, '-inf', 888);
+ERROR:  operand, lower bound, and upper bound cannot be infinity
+SELECT width_bucket(0::float8, '-inf', 4.0::float8, 888);
+ERROR:  lower and upper bounds must be finite
 -- normal operation
 CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
 COPY width_bucket_test (operand_num) FROM stdin;
@@ -1199,6 +1715,60 @@ SELECT '' AS to_char_23, to_char(val, '9.999EEEE')				FROM num_data;
             | -2.493e+07
 (10 rows)
 
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, '9.999EEEE') as numeric,
+  to_char(val::float8, '9.999EEEE') as float8,
+  to_char(val::float4, '9.999EEEE') as float4
+FROM v;
+    val     |  numeric   |   float8   |   float4   
+------------+------------+------------+------------
+          0 |  0.000e+00 |  0.000e+00 |  0.000e+00
+       -4.2 | -4.200e+00 | -4.200e+00 | -4.200e+00
+ 4200000000 |  4.200e+09 |  4.200e+09 |  4.200e+09
+   0.000012 |  1.200e-05 |  1.200e-05 |  1.200e-05
+   Infinity |  #.####### |  #.####### |  #.#######
+  -Infinity |  #.####### |  #.####### |  #.#######
+        NaN |  #.####### |  #.####### |  #.#######
+(7 rows)
+
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, 'MI9999999999.99') as numeric,
+  to_char(val::float8, 'MI9999999999.99') as float8,
+  to_char(val::float4, 'MI9999999999.99') as float4
+FROM v;
+    val     |    numeric     |     float8     |     float4     
+------------+----------------+----------------+----------------
+          0 |            .00 |            .00 |            .00
+       -4.2 | -         4.20 | -         4.20 | -         4.20
+ 4200000000 |  4200000000.00 |  4200000000.00 |  4200000000
+   0.000012 |            .00 |            .00 |            .00
+   Infinity |    Infinity    |    Infinity    |    Infinity
+  -Infinity | -  Infinity    | -  Infinity    | -  Infinity
+        NaN |         NaN    |         NaN    |         NaN
+(7 rows)
+
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, 'MI99.99') as numeric,
+  to_char(val::float8, 'MI99.99') as float8,
+  to_char(val::float4, 'MI99.99') as float4
+FROM v;
+    val     | numeric | float8 | float4 
+------------+---------+--------+--------
+          0 |    .00  |    .00 |    .00
+       -4.2 | - 4.20  | - 4.20 | - 4.20
+ 4200000000 |  ##.##  |  ##.## |  ##.
+   0.000012 |    .00  |    .00 |    .00
+   Infinity |  ##.##  |  ##.## |  ##.
+  -Infinity | -##.##  | -##.## | -##.
+        NaN |  ##.##  |  ##.## |  ##.##
+(7 rows)
+
 SELECT '' AS to_char_24, to_char('100'::numeric, 'FM999.9');
  to_char_24 | to_char 
 ------------+---------
@@ -1426,6 +1996,12 @@ INSERT INTO num_input_test(n1) VALUES ('555.50');
 INSERT INTO num_input_test(n1) VALUES ('-555.50');
 INSERT INTO num_input_test(n1) VALUES ('NaN ');
 INSERT INTO num_input_test(n1) VALUES ('        nan');
+INSERT INTO num_input_test(n1) VALUES (' inf ');
+INSERT INTO num_input_test(n1) VALUES (' +inf ');
+INSERT INTO num_input_test(n1) VALUES (' -inf ');
+INSERT INTO num_input_test(n1) VALUES (' Infinity ');
+INSERT INTO num_input_test(n1) VALUES (' +inFinity ');
+INSERT INTO num_input_test(n1) VALUES (' -INFINITY ');
 -- bad inputs
 INSERT INTO num_input_test(n1) VALUES ('     ');
 ERROR:  invalid input syntax for type numeric: "     "
@@ -1459,17 +2035,27 @@ INSERT INTO num_input_test(n1) VALUES (' N aN ');
 ERROR:  invalid input syntax for type numeric: " N aN "
 LINE 1: INSERT INTO num_input_test(n1) VALUES (' N aN ');
                                                ^
+INSERT INTO num_input_test(n1) VALUES ('+ infinity');
+ERROR:  invalid input syntax for type numeric: "+ infinity"
+LINE 1: INSERT INTO num_input_test(n1) VALUES ('+ infinity');
+                                               ^
 SELECT * FROM num_input_test;
-   n1    
----------
-     123
- 3245874
-  -93853
-  555.50
- -555.50
-     NaN
-     NaN
-(7 rows)
+    n1     
+-----------
+       123
+   3245874
+    -93853
+    555.50
+   -555.50
+       NaN
+       NaN
+  Infinity
+  Infinity
+ -Infinity
+  Infinity
+  Infinity
+ -Infinity
+(13 rows)
 
 --
 -- Test some corner cases for multiplication
@@ -1805,6 +2391,24 @@ select exp(1.0::numeric(71,70));
  2.7182818284590452353602874713526624977572470936999595749669676277240766
 (1 row)
 
+select exp('nan'::numeric);
+ exp 
+-----
+ NaN
+(1 row)
+
+select exp('inf'::numeric);
+   exp    
+----------
+ Infinity
+(1 row)
+
+select exp('-inf'::numeric);
+ exp 
+-----
+   0
+(1 row)
+
 -- cases that used to generate inaccurate results
 select exp(32.999);
          exp         
@@ -1876,6 +2480,12 @@ select * from generate_series('nan'::numeric, 100::numeric, 10::numeric);
 ERROR:  start value cannot be NaN
 select * from generate_series(0::numeric, 'nan'::numeric, 10::numeric);
 ERROR:  stop value cannot be NaN
+select * from generate_series('inf'::numeric, 'inf'::numeric, 10::numeric);
+ERROR:  start value cannot be infinity
+select * from generate_series(0::numeric, 'inf'::numeric, 10::numeric);
+ERROR:  stop value cannot be infinity
+select * from generate_series(0::numeric, '42'::numeric, '-inf'::numeric);
+ERROR:  step size cannot be infinity
 -- Checks maximum, output is truncated
 select (i / (10::numeric ^ 131071))::numeric(1,0)
 	from generate_series(6 * (10::numeric ^ 131071),
@@ -2081,6 +2691,12 @@ select scale(numeric 'NaN');
       
 (1 row)
 
+select scale(numeric 'inf');
+ scale 
+-------
+      
+(1 row)
+
 select scale(NULL::numeric);
  scale 
 -------
@@ -2138,6 +2754,12 @@ select min_scale(numeric 'NaN') is NULL; -- should be true
  t
 (1 row)
 
+select min_scale(numeric 'inf') is NULL; -- should be true
+ ?column? 
+----------
+ t
+(1 row)
+
 select min_scale(0);                     -- no digits
  min_scale 
 -----------
@@ -2207,6 +2829,12 @@ select trim_scale(numeric 'NaN');
         NaN
 (1 row)
 
+select trim_scale(numeric 'inf');
+ trim_scale 
+------------
+   Infinity
+(1 row)
+
 select trim_scale(1.120);
  trim_scale 
 ------------
@@ -2280,7 +2908,11 @@ FROM (VALUES (0::numeric, 0::numeric),
              (0::numeric, 46375::numeric),
              (433125::numeric, 46375::numeric),
              (43312.5::numeric, 4637.5::numeric),
-             (4331.250::numeric, 463.75000::numeric)) AS v(a, b);
+             (4331.250::numeric, 463.75000::numeric),
+             ('inf', '0'),
+             ('inf', '42'),
+             ('inf', 'inf')
+     ) AS v(a, b);
     a     |     b     |   gcd   |   gcd   |   gcd   |   gcd   
 ----------+-----------+---------+---------+---------+---------
         0 |         0 |       0 |       0 |       0 |       0
@@ -2289,7 +2921,10 @@ FROM (VALUES (0::numeric, 0::numeric),
    433125 |     46375 |     875 |     875 |     875 |     875
   43312.5 |    4637.5 |    87.5 |    87.5 |    87.5 |    87.5
  4331.250 | 463.75000 | 8.75000 | 8.75000 | 8.75000 | 8.75000
-(6 rows)
+ Infinity |         0 |     NaN |     NaN |     NaN |     NaN
+ Infinity |        42 |     NaN |     NaN |     NaN |     NaN
+ Infinity |  Infinity |     NaN |     NaN |     NaN |     NaN
+(9 rows)
 
 --
 -- Tests for LCM()
@@ -2301,7 +2936,11 @@ FROM (VALUES (0::numeric, 0::numeric),
              (13272::numeric, 13272::numeric),
              (423282::numeric, 13272::numeric),
              (42328.2::numeric, 1327.2::numeric),
-             (4232.820::numeric, 132.72000::numeric)) AS v(a, b);
+             (4232.820::numeric, 132.72000::numeric),
+             ('inf', '0'),
+             ('inf', '42'),
+             ('inf', 'inf')
+     ) AS v(a, b);
     a     |     b     |     lcm      |     lcm      |     lcm      |     lcm      
 ----------+-----------+--------------+--------------+--------------+--------------
         0 |         0 |            0 |            0 |            0 |            0
@@ -2311,7 +2950,10 @@ FROM (VALUES (0::numeric, 0::numeric),
    423282 |     13272 |     11851896 |     11851896 |     11851896 |     11851896
   42328.2 |    1327.2 |    1185189.6 |    1185189.6 |    1185189.6 |    1185189.6
  4232.820 | 132.72000 | 118518.96000 | 118518.96000 | 118518.96000 | 118518.96000
-(7 rows)
+ Infinity |         0 |          NaN |          NaN |          NaN |          NaN
+ Infinity |        42 |          NaN |          NaN |          NaN |          NaN
+ Infinity |  Infinity |          NaN |          NaN |          NaN |          NaN
+(10 rows)
 
 SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow
 ERROR:  value overflows numeric format
diff --git a/src/test/regress/expected/window.out b/src/test/regress/expected/window.out
index 432edfa0630ec..13c91c9916fa9 100644
--- a/src/test/regress/expected/window.out
+++ b/src/test/regress/expected/window.out
@@ -1872,7 +1872,7 @@ create temp table numerics(
     f_numeric numeric
 );
 insert into numerics values
-(0, '-infinity', '-infinity', '-1000'),  -- numeric type lacks infinities
+(0, '-infinity', '-infinity', '-infinity'),
 (1, -3, -3, -3),
 (2, -1, -1, -1),
 (3, 0, 0, 0),
@@ -1880,7 +1880,7 @@ insert into numerics values
 (5, 1.12, 1.12, 1.12),
 (6, 2, 2, 2),
 (7, 100, 100, 100),
-(8, 'infinity', 'infinity', '1000'),
+(8, 'infinity', 'infinity', 'infinity'),
 (9, 'NaN', 'NaN', 'NaN');
 select id, f_float4, first_value(id) over w, last_value(id) over w
 from numerics
@@ -2078,7 +2078,7 @@ window w as (order by f_numeric range between
              1 preceding and 1 following);
  id | f_numeric | first_value | last_value 
 ----+-----------+-------------+------------
-  0 |     -1000 |           0 |          0
+  0 | -Infinity |           0 |          0
   1 |        -3 |           1 |          1
   2 |        -1 |           2 |          3
   3 |         0 |           2 |          3
@@ -2086,7 +2086,7 @@ window w as (order by f_numeric range between
   5 |      1.12 |           4 |          6
   6 |         2 |           4 |          6
   7 |       100 |           7 |          7
-  8 |      1000 |           8 |          8
+  8 |  Infinity |           8 |          8
   9 |       NaN |           9 |          9
 (10 rows)
 
@@ -2096,7 +2096,7 @@ window w as (order by f_numeric range between
              1 preceding and 1.1::numeric following);
  id | f_numeric | first_value | last_value 
 ----+-----------+-------------+------------
-  0 |     -1000 |           0 |          0
+  0 | -Infinity |           0 |          0
   1 |        -3 |           1 |          1
   2 |        -1 |           2 |          3
   3 |         0 |           2 |          4
@@ -2104,7 +2104,7 @@ window w as (order by f_numeric range between
   5 |      1.12 |           4 |          6
   6 |         2 |           4 |          6
   7 |       100 |           7 |          7
-  8 |      1000 |           8 |          8
+  8 |  Infinity |           8 |          8
   9 |       NaN |           9 |          9
 (10 rows)
 
@@ -2116,6 +2116,60 @@ ERROR:  RANGE with offset PRECEDING/FOLLOWING is not supported for column type n
 LINE 4:              1 preceding and 1.1::float8 following);
                                      ^
 HINT:  Cast the offset value to an appropriate type.
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
+window w as (order by f_numeric range between
+             'inf' preceding and 'inf' following);
+ id | f_numeric | first_value | last_value 
+----+-----------+-------------+------------
+  0 | -Infinity |           0 |          8
+  1 |        -3 |           0 |          8
+  2 |        -1 |           0 |          8
+  3 |         0 |           0 |          8
+  4 |       1.1 |           0 |          8
+  5 |      1.12 |           0 |          8
+  6 |         2 |           0 |          8
+  7 |       100 |           0 |          8
+  8 |  Infinity |           0 |          8
+  9 |       NaN |           9 |          9
+(10 rows)
+
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
+window w as (order by f_numeric range between
+             'inf' preceding and 'inf' preceding);
+ id | f_numeric | first_value | last_value 
+----+-----------+-------------+------------
+  0 | -Infinity |           0 |          0
+  1 |        -3 |           0 |          0
+  2 |        -1 |           0 |          0
+  3 |         0 |           0 |          0
+  4 |       1.1 |           0 |          0
+  5 |      1.12 |           0 |          0
+  6 |         2 |           0 |          0
+  7 |       100 |           0 |          0
+  8 |  Infinity |           0 |          8
+  9 |       NaN |           9 |          9
+(10 rows)
+
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
+window w as (order by f_numeric range between
+             'inf' following and 'inf' following);
+ id | f_numeric | first_value | last_value 
+----+-----------+-------------+------------
+  0 | -Infinity |           0 |          8
+  1 |        -3 |           8 |          8
+  2 |        -1 |           8 |          8
+  3 |         0 |           8 |          8
+  4 |       1.1 |           8 |          8
+  5 |      1.12 |           8 |          8
+  6 |         2 |           8 |          8
+  7 |       100 |           8 |          8
+  8 |  Infinity |           8 |          8
+  9 |       NaN |           9 |          9
+(10 rows)
+
 select id, f_numeric, first_value(id) over w, last_value(id) over w
 from numerics
 window w as (order by f_numeric range between
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 044d5155073c0..54f5cf7ecc43a 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -53,6 +53,8 @@ SELECT var_pop('nan'::float4), var_samp('nan'::float4);
 SELECT stddev_pop('nan'::float4), stddev_samp('nan'::float4);
 SELECT var_pop(1.0::numeric), var_samp(2.0::numeric);
 SELECT stddev_pop(3.0::numeric), stddev_samp(4.0::numeric);
+SELECT var_pop('inf'::numeric), var_samp('inf'::numeric);
+SELECT stddev_pop('inf'::numeric), stddev_samp('inf'::numeric);
 SELECT var_pop('nan'::numeric), var_samp('nan'::numeric);
 SELECT stddev_pop('nan'::numeric), stddev_samp('nan'::numeric);
 
@@ -69,14 +71,26 @@ select sum('NaN'::numeric) from generate_series(1,3);
 select avg('NaN'::numeric) from generate_series(1,3);
 
 -- verify correct results for infinite inputs
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('1'), ('infinity')) v(x);
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('infinity'), ('1')) v(x);
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
 FROM (VALUES ('infinity'), ('infinity')) v(x);
-SELECT avg(x::float8), var_pop(x::float8)
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
+FROM (VALUES ('-infinity'), ('infinity')) v(x);
+SELECT sum(x::float8), avg(x::float8), var_pop(x::float8)
+FROM (VALUES ('-infinity'), ('-infinity')) v(x);
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('1'), ('infinity')) v(x);
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('infinity'), ('1')) v(x);
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('infinity'), ('infinity')) v(x);
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
 FROM (VALUES ('-infinity'), ('infinity')) v(x);
+SELECT sum(x::numeric), avg(x::numeric), var_pop(x::numeric)
+FROM (VALUES ('-infinity'), ('-infinity')) v(x);
 
 -- test accuracy with a large input offset
 SELECT avg(x::float8), var_pop(x::float8)
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 5dc80f686f481..416c16722a9ce 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -634,6 +634,119 @@ SELECT t1.id1, t1.result, t2.expected
     WHERE t1.id1 = t2.id
     AND t1.result != t2.expected;
 
+-- ******************************
+-- * Check behavior with Inf and NaN inputs.  It's easiest to handle these
+-- * separately from the num_data framework used above, because some input
+-- * combinations will throw errors.
+-- ******************************
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('inf'),('-inf'),('nan'))
+SELECT x1, x2,
+  x1 + x2 AS sum,
+  x1 - x2 AS diff,
+  x1 * x2 AS prod
+FROM v AS v1(x1), v AS v2(x2);
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('inf'),('-inf'),('nan'))
+SELECT x1, x2,
+  x1 / x2 AS quot,
+  x1 % x2 AS mod,
+  div(x1, x2) AS div
+FROM v AS v1(x1), v AS v2(x2) WHERE x2 != 0;
+
+SELECT 'inf'::numeric / '0';
+SELECT '-inf'::numeric / '0';
+SELECT 'nan'::numeric / '0';
+SELECT '0'::numeric / '0';
+SELECT 'inf'::numeric % '0';
+SELECT '-inf'::numeric % '0';
+SELECT 'nan'::numeric % '0';
+SELECT '0'::numeric % '0';
+SELECT div('inf'::numeric, '0');
+SELECT div('-inf'::numeric, '0');
+SELECT div('nan'::numeric, '0');
+SELECT div('0'::numeric, '0');
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('inf'),('-inf'),('nan'))
+SELECT x, -x as minusx, abs(x), floor(x), ceil(x), sign(x), numeric_inc(x) as inc
+FROM v;
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('inf'),('-inf'),('nan'))
+SELECT x, round(x), round(x,1) as round1, trunc(x), trunc(x,1) as trunc1
+FROM v;
+
+-- the large values fall into the numeric abbreviation code's maximal classes
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('-1'),('4.2'),('-7.777'),('1e340'),('-1e340'),
+         ('inf'),('-inf'),('nan'),
+         ('inf'),('-inf'),('nan'))
+SELECT substring(x::text, 1, 32)
+FROM v ORDER BY x;
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('4.2'),('inf'),('nan'))
+SELECT x, sqrt(x)
+FROM v;
+
+SELECT sqrt('-1'::numeric);
+SELECT sqrt('-inf'::numeric);
+
+WITH v(x) AS
+  (VALUES('1'::numeric),('4.2'),('inf'),('nan'))
+SELECT x,
+  log(x),
+  log10(x),
+  ln(x)
+FROM v;
+
+SELECT ln('0'::numeric);
+SELECT ln('-1'::numeric);
+SELECT ln('-inf'::numeric);
+
+WITH v(x) AS
+  (VALUES('2'::numeric),('4.2'),('inf'),('nan'))
+SELECT x1, x2,
+  log(x1, x2)
+FROM v AS v1(x1), v AS v2(x2);
+
+SELECT log('0'::numeric, '10');
+SELECT log('10'::numeric, '0');
+SELECT log('-inf'::numeric, '10');
+SELECT log('10'::numeric, '-inf');
+SELECT log('inf'::numeric, '0');
+SELECT log('inf'::numeric, '-inf');
+SELECT log('-inf'::numeric, 'inf');
+
+WITH v(x) AS
+  (VALUES('0'::numeric),('1'),('2'),('4.2'),('inf'),('nan'))
+SELECT x1, x2,
+  power(x1, x2)
+FROM v AS v1(x1), v AS v2(x2) WHERE x1 != 0 OR x2 >= 0;
+
+SELECT power('0'::numeric, '-1');
+SELECT power('0'::numeric, '-inf');
+SELECT power('-1'::numeric, 'inf');
+SELECT power('-2'::numeric, '3');
+SELECT power('-2'::numeric, '3.3');
+SELECT power('-2'::numeric, '-1');
+SELECT power('-2'::numeric, '-1.5');
+SELECT power('-2'::numeric, 'inf');
+SELECT power('-2'::numeric, '-inf');
+SELECT power('inf'::numeric, '-2');
+SELECT power('inf'::numeric, '-inf');
+SELECT power('-inf'::numeric, '2');
+SELECT power('-inf'::numeric, '3');
+SELECT power('-inf'::numeric, '4.5');
+SELECT power('-inf'::numeric, '-2');
+SELECT power('-inf'::numeric, '-3');
+SELECT power('-inf'::numeric, '0');
+SELECT power('-inf'::numeric, 'inf');
+SELECT power('-inf'::numeric, '-inf');
+
 -- ******************************
 -- * miscellaneous checks for things that have been broken in the past...
 -- ******************************
@@ -652,6 +765,9 @@ INSERT INTO fract_only VALUES (5, '0.99994');
 INSERT INTO fract_only VALUES (6, '0.99995');  -- should fail
 INSERT INTO fract_only VALUES (7, '0.00001');
 INSERT INTO fract_only VALUES (8, '0.00017');
+INSERT INTO fract_only VALUES (9, 'NaN');
+INSERT INTO fract_only VALUES (10, 'Inf');	-- should fail
+INSERT INTO fract_only VALUES (11, '-Inf');	-- should fail
 SELECT * FROM fract_only;
 DROP TABLE fract_only;
 
@@ -659,9 +775,25 @@ DROP TABLE fract_only;
 SELECT 'NaN'::float8::numeric;
 SELECT 'Infinity'::float8::numeric;
 SELECT '-Infinity'::float8::numeric;
+SELECT 'NaN'::numeric::float8;
+SELECT 'Infinity'::numeric::float8;
+SELECT '-Infinity'::numeric::float8;
 SELECT 'NaN'::float4::numeric;
 SELECT 'Infinity'::float4::numeric;
 SELECT '-Infinity'::float4::numeric;
+SELECT 'NaN'::numeric::float4;
+SELECT 'Infinity'::numeric::float4;
+SELECT '-Infinity'::numeric::float4;
+SELECT '42'::int2::numeric;
+SELECT 'NaN'::numeric::int2;
+SELECT 'Infinity'::numeric::int2;
+SELECT '-Infinity'::numeric::int2;
+SELECT 'NaN'::numeric::int4;
+SELECT 'Infinity'::numeric::int4;
+SELECT '-Infinity'::numeric::int4;
+SELECT 'NaN'::numeric::int8;
+SELECT 'Infinity'::numeric::int8;
+SELECT '-Infinity'::numeric::int8;
 
 -- Simple check that ceil(), floor(), and round() work correctly
 CREATE TABLE ceil_floor_round (a numeric);
@@ -697,6 +829,9 @@ SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5);
 SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888);
 SELECT width_bucket('NaN', 3.0, 4.0, 888);
 SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888);
+SELECT width_bucket('inf', 3.0, 4.0, 888);
+SELECT width_bucket(2.0, 3.0, '-inf', 888);
+SELECT width_bucket(0::float8, '-inf', 4.0::float8, 888);
 
 -- normal operation
 CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
@@ -782,6 +917,30 @@ SELECT '' AS to_char_21, to_char(val, '999999SG9999999999')			FROM num_data;
 SELECT '' AS to_char_22, to_char(val, 'FM9999999999999999.999999999999999')	FROM num_data;
 SELECT '' AS to_char_23, to_char(val, '9.999EEEE')				FROM num_data;
 
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, '9.999EEEE') as numeric,
+  to_char(val::float8, '9.999EEEE') as float8,
+  to_char(val::float4, '9.999EEEE') as float4
+FROM v;
+
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, 'MI9999999999.99') as numeric,
+  to_char(val::float8, 'MI9999999999.99') as float8,
+  to_char(val::float4, 'MI9999999999.99') as float4
+FROM v;
+
+WITH v(val) AS
+  (VALUES('0'::numeric),('-4.2'),('4.2e9'),('1.2e-5'),('inf'),('-inf'),('nan'))
+SELECT val,
+  to_char(val, 'MI99.99') as numeric,
+  to_char(val::float8, 'MI99.99') as float8,
+  to_char(val::float4, 'MI99.99') as float4
+FROM v;
+
 SELECT '' AS to_char_24, to_char('100'::numeric, 'FM999.9');
 SELECT '' AS to_char_25, to_char('100'::numeric, 'FM999.');
 SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');
@@ -839,6 +998,12 @@ INSERT INTO num_input_test(n1) VALUES ('555.50');
 INSERT INTO num_input_test(n1) VALUES ('-555.50');
 INSERT INTO num_input_test(n1) VALUES ('NaN ');
 INSERT INTO num_input_test(n1) VALUES ('        nan');
+INSERT INTO num_input_test(n1) VALUES (' inf ');
+INSERT INTO num_input_test(n1) VALUES (' +inf ');
+INSERT INTO num_input_test(n1) VALUES (' -inf ');
+INSERT INTO num_input_test(n1) VALUES (' Infinity ');
+INSERT INTO num_input_test(n1) VALUES (' +inFinity ');
+INSERT INTO num_input_test(n1) VALUES (' -INFINITY ');
 
 -- bad inputs
 INSERT INTO num_input_test(n1) VALUES ('     ');
@@ -849,6 +1014,7 @@ INSERT INTO num_input_test(n1) VALUES ('5 . 0');
 INSERT INTO num_input_test(n1) VALUES ('5. 0   ');
 INSERT INTO num_input_test(n1) VALUES ('');
 INSERT INTO num_input_test(n1) VALUES (' N aN ');
+INSERT INTO num_input_test(n1) VALUES ('+ infinity');
 
 SELECT * FROM num_input_test;
 
@@ -952,6 +1118,9 @@ select 1.234 ^ 5678;
 select exp(0.0);
 select exp(1.0);
 select exp(1.0::numeric(71,70));
+select exp('nan'::numeric);
+select exp('inf'::numeric);
+select exp('-inf'::numeric);
 
 -- cases that used to generate inaccurate results
 select exp(32.999);
@@ -973,6 +1142,9 @@ select * from generate_series(-100::numeric, 100::numeric, 0::numeric);
 select * from generate_series(-100::numeric, 100::numeric, 'nan'::numeric);
 select * from generate_series('nan'::numeric, 100::numeric, 10::numeric);
 select * from generate_series(0::numeric, 'nan'::numeric, 10::numeric);
+select * from generate_series('inf'::numeric, 'inf'::numeric, 10::numeric);
+select * from generate_series(0::numeric, 'inf'::numeric, 10::numeric);
+select * from generate_series(0::numeric, '42'::numeric, '-inf'::numeric);
 -- Checks maximum, output is truncated
 select (i / (10::numeric ^ 131071))::numeric(1,0)
 	from generate_series(6 * (10::numeric ^ 131071),
@@ -1040,6 +1212,7 @@ select log(3.1954752e47, 9.4792021e-73);
 --
 
 select scale(numeric 'NaN');
+select scale(numeric 'inf');
 select scale(NULL::numeric);
 select scale(1.12);
 select scale(0);
@@ -1054,6 +1227,7 @@ select scale(-13.000000000000000);
 --
 
 select min_scale(numeric 'NaN') is NULL; -- should be true
+select min_scale(numeric 'inf') is NULL; -- should be true
 select min_scale(0);                     -- no digits
 select min_scale(0.00);                  -- no digits again
 select min_scale(1.0);                   -- no scale
@@ -1070,6 +1244,7 @@ select min_scale(1e100);                 -- very big number
 --
 
 select trim_scale(numeric 'NaN');
+select trim_scale(numeric 'inf');
 select trim_scale(1.120);
 select trim_scale(0);
 select trim_scale(0.00);
@@ -1096,7 +1271,11 @@ FROM (VALUES (0::numeric, 0::numeric),
              (0::numeric, 46375::numeric),
              (433125::numeric, 46375::numeric),
              (43312.5::numeric, 4637.5::numeric),
-             (4331.250::numeric, 463.75000::numeric)) AS v(a, b);
+             (4331.250::numeric, 463.75000::numeric),
+             ('inf', '0'),
+             ('inf', '42'),
+             ('inf', 'inf')
+     ) AS v(a, b);
 
 --
 -- Tests for LCM()
@@ -1108,7 +1287,11 @@ FROM (VALUES (0::numeric, 0::numeric),
              (13272::numeric, 13272::numeric),
              (423282::numeric, 13272::numeric),
              (42328.2::numeric, 1327.2::numeric),
-             (4232.820::numeric, 132.72000::numeric)) AS v(a, b);
+             (4232.820::numeric, 132.72000::numeric),
+             ('inf', '0'),
+             ('inf', '42'),
+             ('inf', 'inf')
+     ) AS v(a, b);
 
 SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow
 
diff --git a/src/test/regress/sql/window.sql b/src/test/regress/sql/window.sql
index 51ec0bac9ad14..af206ca4664e2 100644
--- a/src/test/regress/sql/window.sql
+++ b/src/test/regress/sql/window.sql
@@ -499,7 +499,7 @@ create temp table numerics(
 );
 
 insert into numerics values
-(0, '-infinity', '-infinity', '-1000'),  -- numeric type lacks infinities
+(0, '-infinity', '-infinity', '-infinity'),
 (1, -3, -3, -3),
 (2, -1, -1, -1),
 (3, 0, 0, 0),
@@ -507,7 +507,7 @@ insert into numerics values
 (5, 1.12, 1.12, 1.12),
 (6, 2, 2, 2),
 (7, 100, 100, 100),
-(8, 'infinity', 'infinity', '1000'),
+(8, 'infinity', 'infinity', 'infinity'),
 (9, 'NaN', 'NaN', 'NaN');
 
 select id, f_float4, first_value(id) over w, last_value(id) over w
@@ -574,6 +574,18 @@ window w as (order by f_numeric range between
              1 preceding and 1.1::float8 following);  -- currently unsupported
 select id, f_numeric, first_value(id) over w, last_value(id) over w
 from numerics
+window w as (order by f_numeric range between
+             'inf' preceding and 'inf' following);
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
+window w as (order by f_numeric range between
+             'inf' preceding and 'inf' preceding);
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
+window w as (order by f_numeric range between
+             'inf' following and 'inf' following);
+select id, f_numeric, first_value(id) over w, last_value(id) over w
+from numerics
 window w as (order by f_numeric range between
              1.1 preceding and 'NaN' following);  -- error, NaN disallowed
 

From 38f60f174e3279069b5547b5f4015eb4a8492037 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Thu, 23 Jul 2020 08:29:08 +0900
Subject: [PATCH 02/54] Revert "Fix corner case with PGP decompression in
 pgcrypto"

This reverts commit 9e10898, after finding out that buildfarm members
running SLES 15 on z390 complain on the compression and decompression
logic of the new test: pipistrelles, barbthroat and steamerduck.

Those hosts are visibly using hardware-specific changes to improve zlib
performance, requiring more investigation.

Thanks to Tom Lane for the discussion.

Discussion: https://postgr.es/m/20200722093749.GA2564@paquier.xyz
Backpatch-through: 9.5
---
 contrib/pgcrypto/expected/pgp-compression.out | 30 -------------------
 contrib/pgcrypto/pgp-compress.c               | 22 +++++++-------
 contrib/pgcrypto/sql/pgp-compression.sql      | 21 -------------
 3 files changed, 11 insertions(+), 62 deletions(-)

diff --git a/contrib/pgcrypto/expected/pgp-compression.out b/contrib/pgcrypto/expected/pgp-compression.out
index d4c57feba30b8..32b350b8fe05b 100644
--- a/contrib/pgcrypto/expected/pgp-compression.out
+++ b/contrib/pgcrypto/expected/pgp-compression.out
@@ -48,33 +48,3 @@ select pgp_sym_decrypt(
  Secret message
 (1 row)
 
--- check corner case involving an input string of 16kB, as per bug #16476.
-SELECT setseed(0);
- setseed 
----------
- 
-(1 row)
-
-WITH random_string AS
-(
-  -- This generates a random string of 16366 bytes.  This is chosen
-  -- as random so that it does not get compressed, and the decompression
-  -- would work on a string with the same length as the origin, making the
-  -- test behavior more predictible.  lpad() ensures that the generated
-  -- hexadecimal value is completed by extra zero characters if random()
-  -- has generated a value strictly lower than 16.
-  SELECT string_agg(decode(lpad(to_hex((random()*256)::int), 2, '0'), 'hex'), '') as bytes
-    FROM generate_series(0, 16365)
-)
-SELECT bytes =
-    pgp_sym_decrypt_bytea(
-      pgp_sym_encrypt_bytea(bytes, 'key',
-                            'compress-algo=1,compress-level=1'),
-                            'key', 'expect-compress-algo=1')
-    AS is_same
-  FROM random_string;
- is_same 
----------
- t
-(1 row)
-
diff --git a/contrib/pgcrypto/pgp-compress.c b/contrib/pgcrypto/pgp-compress.c
index 17f5b2ef93dc1..0505bdee9237f 100644
--- a/contrib/pgcrypto/pgp-compress.c
+++ b/contrib/pgcrypto/pgp-compress.c
@@ -243,17 +243,6 @@ decompress_read(void *priv, PullFilter *src, int len,
 	struct DecomprData *dec = priv;
 
 restart:
-	if (dec->stream.avail_in == 0)
-	{
-		uint8	   *tmp;
-
-		res = pullf_read(src, 8192, &tmp);
-		if (res < 0)
-			return res;
-		dec->stream.next_in = tmp;
-		dec->stream.avail_in = res;
-	}
-
 	if (dec->buf_data > 0)
 	{
 		if (len > dec->buf_data)
@@ -267,6 +256,17 @@ decompress_read(void *priv, PullFilter *src, int len,
 	if (dec->eof)
 		return 0;
 
+	if (dec->stream.avail_in == 0)
+	{
+		uint8	   *tmp;
+
+		res = pullf_read(src, 8192, &tmp);
+		if (res < 0)
+			return res;
+		dec->stream.next_in = tmp;
+		dec->stream.avail_in = res;
+	}
+
 	dec->stream.next_out = dec->buf;
 	dec->stream.avail_out = dec->buf_len;
 	dec->pos = dec->buf;
diff --git a/contrib/pgcrypto/sql/pgp-compression.sql b/contrib/pgcrypto/sql/pgp-compression.sql
index 87c59c6cabc4e..ca9ee1fc0088a 100644
--- a/contrib/pgcrypto/sql/pgp-compression.sql
+++ b/contrib/pgcrypto/sql/pgp-compression.sql
@@ -28,24 +28,3 @@ select pgp_sym_decrypt(
 	pgp_sym_encrypt('Secret message', 'key',
 			'compress-algo=2, compress-level=0'),
 	'key', 'expect-compress-algo=0');
-
--- check corner case involving an input string of 16kB, as per bug #16476.
-SELECT setseed(0);
-WITH random_string AS
-(
-  -- This generates a random string of 16366 bytes.  This is chosen
-  -- as random so that it does not get compressed, and the decompression
-  -- would work on a string with the same length as the origin, making the
-  -- test behavior more predictible.  lpad() ensures that the generated
-  -- hexadecimal value is completed by extra zero characters if random()
-  -- has generated a value strictly lower than 16.
-  SELECT string_agg(decode(lpad(to_hex((random()*256)::int), 2, '0'), 'hex'), '') as bytes
-    FROM generate_series(0, 16365)
-)
-SELECT bytes =
-    pgp_sym_decrypt_bytea(
-      pgp_sym_encrypt_bytea(bytes, 'key',
-                            'compress-algo=1,compress-level=1'),
-                            'key', 'expect-compress-algo=1')
-    AS is_same
-  FROM random_string;

From c55040ccd017962b7b8d3fbcdc184aa90c722a21 Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Thu, 23 Jul 2020 08:19:07 +0530
Subject: [PATCH 03/54] WAL Log invalidations at command end with
 wal_level=logical.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When wal_level=logical, write invalidations at command end into WAL so
that decoding can use this information.

This patch is required to allow the streaming of in-progress transactions
in logical decoding.  The actual work to allow streaming will be committed
as a separate patch.

We still add the invalidations to the cache and write them to WAL at
commit time in RecordTransactionCommit(). This uses the existing
XLOG_INVALIDATIONS xlog record type, from the RM_STANDBY_ID resource
manager (see LogStandbyInvalidations for details).

So existing code relying on those invalidations (e.g. redo) does not need
to be changed.

The invalidations written at command end uses a new xlog record type
XLOG_XACT_INVALIDATIONS, from RM_XACT_ID resource manager. See
LogLogicalInvalidations for details.

These new xlog records are ignored by existing redo procedures, which
still rely on the invalidations written to commit records.

The invalidations are decoded and accumulated in top-transaction, and then
executed during replay.  This obviates the need to decode the
invalidations as part of a commit record.

Bump XLOG_PAGE_MAGIC, since this introduces XLOG_XACT_INVALIDATIONS.

Author: Dilip Kumar, Tomas Vondra, Amit Kapila
Reviewed-by: Amit Kapila
Tested-by: Neha Sharma and Mahendra Singh Thalor
Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com
---
 src/backend/access/rmgrdesc/xactdesc.c        | 10 ++++
 src/backend/access/transam/xact.c             | 17 ++++++
 src/backend/replication/logical/decode.c      | 58 +++++++++++--------
 .../replication/logical/reorderbuffer.c       | 52 ++++++++++++++---
 src/backend/utils/cache/inval.c               | 54 +++++++++++++++++
 src/include/access/xact.h                     |  2 +-
 src/include/access/xlog_internal.h            |  2 +-
 src/include/replication/reorderbuffer.h       |  3 +
 src/include/utils/inval.h                     |  2 +
 9 files changed, 166 insertions(+), 34 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c
index 9fce75565f4b3..addd95faec140 100644
--- a/src/backend/access/rmgrdesc/xactdesc.c
+++ b/src/backend/access/rmgrdesc/xactdesc.c
@@ -396,6 +396,13 @@ xact_desc(StringInfo buf, XLogReaderState *record)
 		appendStringInfo(buf, "xtop %u: ", xlrec->xtop);
 		xact_desc_assignment(buf, xlrec);
 	}
+	else if (info == XLOG_XACT_INVALIDATIONS)
+	{
+		xl_xact_invals *xlrec = (xl_xact_invals *) rec;
+
+		standby_desc_invalidations(buf, xlrec->nmsgs, xlrec->msgs, InvalidOid,
+								   InvalidOid, false);
+	}
 }
 
 const char *
@@ -423,6 +430,9 @@ xact_identify(uint8 info)
 		case XLOG_XACT_ASSIGNMENT:
 			id = "ASSIGNMENT";
 			break;
+		case XLOG_XACT_INVALIDATIONS:
+			id = "INVALIDATION";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index bd4c3cf32585f..d4f7c29847f43 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1224,6 +1224,16 @@ RecordTransactionCommit(void)
 	bool		RelcacheInitFileInval = false;
 	bool		wrote_xlog;
 
+	/*
+	 * Log pending invalidations for logical decoding of in-progress
+	 * transactions.  Normally for DDLs, we log this at each command end,
+	 * however, for certain cases where we directly update the system table
+	 * without a transaction block, the invalidations are not logged till this
+	 * time.
+	 */
+	if (XLogLogicalInfoActive())
+		LogLogicalInvalidations();
+
 	/* Get data needed for commit record */
 	nrels = smgrGetPendingDeletes(true, &rels);
 	nchildren = xactGetCommittedChildren(&children);
@@ -6022,6 +6032,13 @@ xact_redo(XLogReaderState *record)
 			ProcArrayApplyXidAssignment(xlrec->xtop,
 										xlrec->nsubxacts, xlrec->xsub);
 	}
+	else if (info == XLOG_XACT_INVALIDATIONS)
+	{
+		/*
+		 * XXX we do ignore this for now, what matters are invalidations
+		 * written into the commit record.
+		 */
+	}
 	else
 		elog(PANIC, "xact_redo: unknown op code %u", info);
 }
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 0c0c371739199..f3a1c31a2921c 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -278,10 +278,39 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
 			/*
 			 * We assign subxact to the toplevel xact while processing each
-			 * record if required.  So, we don't need to do anything here.
-			 * See LogicalDecodingProcessRecord.
+			 * record if required.  So, we don't need to do anything here. See
+			 * LogicalDecodingProcessRecord.
 			 */
 			break;
+		case XLOG_XACT_INVALIDATIONS:
+			{
+				TransactionId xid;
+				xl_xact_invals *invals;
+
+				xid = XLogRecGetXid(r);
+				invals = (xl_xact_invals *) XLogRecGetData(r);
+
+				/*
+				 * Execute the invalidations for xid-less transactions,
+				 * otherwise, accumulate them so that they can be processed at
+				 * the commit time.
+				 */
+				if (TransactionIdIsValid(xid))
+				{
+					if (!ctx->fast_forward)
+						ReorderBufferAddInvalidations(reorder, xid,
+													  buf->origptr,
+													  invals->nmsgs,
+													  invals->msgs);
+					ReorderBufferXidSetCatalogChanges(ctx->reorder, xid,
+													  buf->origptr);
+				}
+				else if ((!ctx->fast_forward))
+					ReorderBufferImmediateInvalidation(ctx->reorder,
+													   invals->nmsgs,
+													   invals->msgs);
+			}
+			break;
 		case XLOG_XACT_PREPARE:
 
 			/*
@@ -334,15 +363,11 @@ DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_STANDBY_LOCK:
 			break;
 		case XLOG_INVALIDATIONS:
-			{
-				xl_invalidations *invalidations =
-				(xl_invalidations *) XLogRecGetData(r);
 
-				if (!ctx->fast_forward)
-					ReorderBufferImmediateInvalidation(ctx->reorder,
-													   invalidations->nmsgs,
-													   invalidations->msgs);
-			}
+			/*
+			 * We are processing the invalidations at the command level via
+			 * XLOG_XACT_INVALIDATIONS.  So we don't need to do anything here.
+			 */
 			break;
 		default:
 			elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info);
@@ -573,19 +598,6 @@ DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
 		commit_time = parsed->origin_timestamp;
 	}
 
-	/*
-	 * Process invalidation messages, even if we're not interested in the
-	 * transaction's contents, since the various caches need to always be
-	 * consistent.
-	 */
-	if (parsed->nmsgs > 0)
-	{
-		if (!ctx->fast_forward)
-			ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr,
-										  parsed->nmsgs, parsed->msgs);
-		ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
-	}
-
 	SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
 					   parsed->nsubxacts, parsed->subxacts);
 
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 449327a147f92..ce6e62152f037 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -856,6 +856,9 @@ ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
 	subtxn->toplevel_xid = xid;
 	Assert(subtxn->nsubtxns == 0);
 
+	/* set the reference to top-level transaction */
+	subtxn->toptxn = txn;
+
 	/* add to subtransaction list */
 	dlist_push_tail(&txn->subtxns, &subtxn->node);
 	txn->nsubtxns++;
@@ -2201,7 +2204,11 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
 /*
  * Setup the invalidation of the toplevel transaction.
  *
- * This needs to be done before ReorderBufferCommit is called!
+ * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
+ * accumulates all the invalidation messages in the toplevel transaction.
+ * This is required because in some cases where we skip processing the
+ * transaction (see ReorderBufferForget), we need to execute all the
+ * invalidations together.
  */
 void
 ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
@@ -2212,17 +2219,35 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
 
 	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
 
-	if (txn->ninvalidations != 0)
-		elog(ERROR, "only ever add one set of invalidations");
+	/*
+	 * We collect all the invalidations under the top transaction so that we
+	 * can execute them all together.
+	 */
+	if (txn->toptxn)
+		txn = txn->toptxn;
 
 	Assert(nmsgs > 0);
 
-	txn->ninvalidations = nmsgs;
-	txn->invalidations = (SharedInvalidationMessage *)
-		MemoryContextAlloc(rb->context,
-						   sizeof(SharedInvalidationMessage) * nmsgs);
-	memcpy(txn->invalidations, msgs,
-		   sizeof(SharedInvalidationMessage) * nmsgs);
+	/* Accumulate invalidations. */
+	if (txn->ninvalidations == 0)
+	{
+		txn->ninvalidations = nmsgs;
+		txn->invalidations = (SharedInvalidationMessage *)
+			MemoryContextAlloc(rb->context,
+							   sizeof(SharedInvalidationMessage) * nmsgs);
+		memcpy(txn->invalidations, msgs,
+			   sizeof(SharedInvalidationMessage) * nmsgs);
+	}
+	else
+	{
+		txn->invalidations = (SharedInvalidationMessage *)
+			repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
+					 (txn->ninvalidations + nmsgs));
+
+		memcpy(txn->invalidations + txn->ninvalidations, msgs,
+			   nmsgs * sizeof(SharedInvalidationMessage));
+		txn->ninvalidations += nmsgs;
+	}
 }
 
 /*
@@ -2250,6 +2275,15 @@ ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
 	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
 
 	txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
+
+	/*
+	 * Mark top-level transaction as having catalog changes too if one of its
+	 * children has so that the ReorderBufferBuildTupleCidHash can
+	 * conveniently check just top-level transaction and decide whether to
+	 * build the hash table or not.
+	 */
+	if (txn->toptxn != NULL)
+		txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
 }
 
 /*
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 591dd33be6786..628d6f5d0cceb 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -85,6 +85,9 @@
  *	worth trying to avoid sending such inval traffic in the future, if those
  *	problems can be overcome cheaply.
  *
+ *	When wal_level=logical, write invalidations into WAL at each command end to
+ *	support the decoding of the in-progress transactions.  See
+ *	CommandEndInvalidationMessages.
  *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -1094,6 +1097,11 @@ CommandEndInvalidationMessages(void)
 
 	ProcessInvalidationMessages(&transInvalInfo->CurrentCmdInvalidMsgs,
 								LocalExecuteInvalidationMessage);
+
+	/* WAL Log per-command invalidation messages for wal_level=logical */
+	if (XLogLogicalInfoActive())
+		LogLogicalInvalidations();
+
 	AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
 							   &transInvalInfo->CurrentCmdInvalidMsgs);
 }
@@ -1501,3 +1509,49 @@ CallSyscacheCallbacks(int cacheid, uint32 hashvalue)
 		i = ccitem->link - 1;
 	}
 }
+
+/*
+ * LogLogicalInvalidations
+ *
+ * Emit WAL for invalidations.  This is currently only used for logging
+ * invalidations at the command end or at commit time if any invalidations
+ * are pending.
+ */
+void
+LogLogicalInvalidations()
+{
+	xl_xact_invals xlrec;
+	SharedInvalidationMessage *invalMessages;
+	int			nmsgs = 0;
+
+	/* Quick exit if we haven't done anything with invalidation messages. */
+	if (transInvalInfo == NULL)
+		return;
+
+	ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+									 MakeSharedInvalidMessagesArray);
+
+	Assert(!(numSharedInvalidMessagesArray > 0 &&
+			 SharedInvalidMessagesArray == NULL));
+
+	invalMessages = SharedInvalidMessagesArray;
+	nmsgs = numSharedInvalidMessagesArray;
+	SharedInvalidMessagesArray = NULL;
+	numSharedInvalidMessagesArray = 0;
+
+	if (nmsgs > 0)
+	{
+		/* prepare record */
+		memset(&xlrec, 0, MinSizeOfXactInvals);
+		xlrec.nmsgs = nmsgs;
+
+		/* perform insertion */
+		XLogBeginInsert();
+		XLogRegisterData((char *) (&xlrec), MinSizeOfXactInvals);
+		XLogRegisterData((char *) invalMessages,
+						 nmsgs * sizeof(SharedInvalidationMessage));
+		XLogInsert(RM_XACT_ID, XLOG_XACT_INVALIDATIONS);
+
+		pfree(invalMessages);
+	}
+}
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index aef8555367449..53480116a4622 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -146,7 +146,7 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
 #define XLOG_XACT_COMMIT_PREPARED	0x30
 #define XLOG_XACT_ABORT_PREPARED	0x40
 #define XLOG_XACT_ASSIGNMENT		0x50
-/* free opcode 0x60 */
+#define XLOG_XACT_INVALIDATIONS		0x60
 /* free opcode 0x70 */
 
 /* mask for filtering opcodes out of xl_info */
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index b9490a3afeff2..9b2da56379e15 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD107	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD108	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 019bd382de9b9..1055e99e2e140 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -220,6 +220,9 @@ typedef struct ReorderBufferTXN
 	 */
 	XLogRecPtr	end_lsn;
 
+	/* Toplevel transaction for this subxact (NULL for top-level). */
+	struct ReorderBufferTXN *toptxn;
+
 	/*
 	 * LSN of the last lsn at which snapshot information reside, so we can
 	 * restart decoding from there and fully recover this transaction from
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index bc5081cf7210b..463888c3894f9 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -61,4 +61,6 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
 extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
 
 extern void InvalidateSystemCaches(void);
+
+extern void LogLogicalInvalidations(void);
 #endif							/* INVAL_H */

From 42dee8b8e362bae05de2234a4fc7a3aa9dacdf6f Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Thu, 23 Jul 2020 21:10:49 +1200
Subject: [PATCH 04/54] Fix error message.

Remove extra space.  Back-patch to all releases, like commit 7897e3bb.

Author: Lu, Chenyang <lucy.fnst@cn.fujitsu.com>
Discussion: https://postgr.es/m/795d03c6129844d3803e7eea48f5af0d%40G08CNEXMBPEKD04.g08.fujitsu.local
---
 src/backend/storage/file/buffile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 3907349b691ea..2d7a08232089d 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -499,7 +499,7 @@ BufFileDumpBuffer(BufFile *file)
 		if (bytestowrite <= 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
-					 errmsg("could not write to file \"%s\" : %m",
+					 errmsg("could not write to file \"%s\": %m",
 							FilePathName(thisfile))));
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;

From 5733fa0fe4a73efa46801aa4189f7da17dd2b4bf Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Thu, 23 Jul 2020 17:13:00 +0200
Subject: [PATCH 05/54] doc: Document that ssl_ciphers does not affect TLS 1.3

TLS 1.3 uses a different way of specifying ciphers and a different
OpenSSL API.  PostgreSQL currently does not support setting those
ciphers.  For now, just document this.  In the future, support for
this might be added somehow.

Reviewed-by: Jonathan S. Katz <jkatz@postgresql.org>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
---
 doc/src/sgml/config.sgml | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ca6a3a523ff68..6ce59078967ce 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1216,16 +1216,22 @@ include_dir 'conf.d'
       </term>
       <listitem>
        <para>
-        Specifies a list of <acronym>SSL</acronym> cipher suites that are allowed to be
-        used on secure connections.  See
-        the <citerefentry><refentrytitle>ciphers</refentrytitle></citerefentry> manual page
-        in the <application>OpenSSL</application> package for the syntax of this setting
-        and a list of supported values.
-        This parameter can only be set in the <filename>postgresql.conf</filename>
-        file or on the server command line.
-        The default value is <literal>HIGH:MEDIUM:+3DES:!aNULL</literal>.  The
-        default is usually a reasonable choice unless you have specific
-        security requirements.
+        Specifies a list of <acronym>SSL</acronym> cipher suites that are
+        allowed to be used by SSL connections.  See the
+        <citerefentry><refentrytitle>ciphers</refentrytitle></citerefentry>
+        manual page in the <application>OpenSSL</application> package for the
+        syntax of this setting and a list of supported values.  Only
+        connections using TLS version 1.2 and lower are affected.  There is
+        currently no setting that controls the cipher choices used by TLS
+        version 1.3 connections.  The default value is
+        <literal>HIGH:MEDIUM:+3DES:!aNULL</literal>.  The default is usually a
+        reasonable choice unless you have specific security requirements.
+       </para>
+
+       <para>
+        This parameter can only be set in the
+        <filename>postgresql.conf</filename> file or on the server command
+        line.
        </para>
 
        <para>

From b9b610577d7f70d959968c3697557011699b3a54 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 23 Jul 2020 17:19:37 -0400
Subject: [PATCH 06/54] Fix ancient violation of zlib's API spec.

contrib/pgcrypto mishandled the case where deflate() does not consume
all of the offered input on the first try.  It reset the next_in pointer
to the start of the input instead of leaving it alone, causing the wrong
data to be fed to the next deflate() call.

This has been broken since pgcrypto was committed.  The reason for the
lack of complaints seems to be that it's fairly hard to get stock zlib
to not consume all the input, so long as the output buffer is big enough
(which it normally would be in pgcrypto's usage; AFAICT the input is
always going to be packetized into packets no larger than ZIP_OUT_BUF).
However, IBM's zlibNX implementation for AIX evidently will do it
in some cases.

I did not add a test case for this, because I couldn't find one that
would fail with stock zlib.  When we put back the test case for
bug #16476, that will cover the zlibNX situation well enough.

While here, write deflate()'s second argument as Z_NO_FLUSH per its
API spec, instead of hard-wiring the value zero.

Per buildfarm results and subsequent investigation.

Discussion: https://postgr.es/m/16476-692ef7b84e5fb893@postgresql.org
---
 contrib/pgcrypto/pgp-compress.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/contrib/pgcrypto/pgp-compress.c b/contrib/pgcrypto/pgp-compress.c
index 0505bdee9237f..4b1d2a1ff5f96 100644
--- a/contrib/pgcrypto/pgp-compress.c
+++ b/contrib/pgcrypto/pgp-compress.c
@@ -114,13 +114,13 @@ compress_process(PushFilter *next, void *priv, const uint8 *data, int len)
 	/*
 	 * process data
 	 */
-	while (len > 0)
+	st->stream.next_in = unconstify(uint8 *, data);
+	st->stream.avail_in = len;
+	while (st->stream.avail_in > 0)
 	{
-		st->stream.next_in = unconstify(uint8 *, data);
-		st->stream.avail_in = len;
 		st->stream.next_out = st->buf;
 		st->stream.avail_out = st->buf_len;
-		res = deflate(&st->stream, 0);
+		res = deflate(&st->stream, Z_NO_FLUSH);
 		if (res != Z_OK)
 			return PXE_PGP_COMPRESSION_ERROR;
 
@@ -131,7 +131,6 @@ compress_process(PushFilter *next, void *priv, const uint8 *data, int len)
 			if (res < 0)
 				return res;
 		}
-		len = st->stream.avail_in;
 	}
 
 	return 0;
@@ -154,6 +153,7 @@ compress_flush(PushFilter *next, void *priv)
 		zres = deflate(&st->stream, Z_FINISH);
 		if (zres != Z_STREAM_END && zres != Z_OK)
 			return PXE_PGP_COMPRESSION_ERROR;
+
 		n_out = st->buf_len - st->stream.avail_out;
 		if (n_out > 0)
 		{

From 25244b8972a34b838c4033fe9efc1d31cba9d0e3 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Fri, 24 Jul 2020 10:34:16 +0200
Subject: [PATCH 07/54] Rename configure.in to configure.ac

The new name has been preferred by Autoconf for a long time.  Future
versions of Autoconf will warn about the old name.

Discussion: https://www.postgresql.org/message-id/flat/e796c185-5ece-8569-248f-dd3799701be1%402ndquadrant.com
---
 config/general.m4            |  2 +-
 configure.in => configure.ac |  4 ++--
 src/backend/catalog/Makefile |  4 ++--
 src/include/pg_config.h.in   |  2 +-
 src/tools/msvc/Solution.pm   | 10 +++++-----
 src/tools/version_stamp.pl   | 12 ++++++------
 6 files changed, 17 insertions(+), 17 deletions(-)
 rename configure.in => configure.ac (99%)

diff --git a/config/general.m4 b/config/general.m4
index 95d65ceb093d1..140b9737bfbc7 100644
--- a/config/general.m4
+++ b/config/general.m4
@@ -8,7 +8,7 @@
 # argument (other than "yes/no"), etc.
 #
 # The point of this implementation is to reduce code size and
-# redundancy in configure.in and to improve robustness and consistency
+# redundancy in configure.ac and to improve robustness and consistency
 # in the option evaluation code.
 
 
diff --git a/configure.in b/configure.ac
similarity index 99%
rename from configure.in
rename to configure.ac
index e91e49a579ee9..eb2c731b58fb5 100644
--- a/configure.in
+++ b/configure.ac
@@ -1,5 +1,5 @@
 dnl Process this file with autoconf to produce a configure script.
-dnl configure.in
+dnl configure.ac
 dnl
 dnl Developers, please strive to achieve this order:
 dnl
@@ -21,7 +21,7 @@ AC_INIT([PostgreSQL], [14devel], [pgsql-bugs@lists.postgresql.org], [], [https:/
 
 m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
 Untested combinations of 'autoconf' and PostgreSQL versions are not
-recommended.  You can remove the check from 'configure.in' but it is then
+recommended.  You can remove the check from 'configure.ac' but it is then
 your responsibility whether the result works or not.])])
 AC_COPYRIGHT([Copyright (c) 1996-2020, PostgreSQL Global Development Group])
 AC_CONFIG_SRCDIR([src/backend/access/common/heaptuple.c])
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index 9499bb33e566e..93cf6d4368579 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -103,10 +103,10 @@ generated-header-symlinks: $(top_builddir)/src/include/catalog/header-stamp
 # won't update them if they didn't change (to avoid unnecessary recompiles).
 # Technically, this should depend on Makefile.global which supplies
 # $(MAJORVERSION); but then genbki.pl would need to be re-run after every
-# configure run, even in distribution tarballs.  So depending on configure.in
+# configure run, even in distribution tarballs.  So depending on configure.ac
 # instead is cheating a bit, but it will achieve the goal of updating the
 # version number when it changes.
-bki-stamp: genbki.pl Catalog.pm $(POSTGRES_BKI_SRCS) $(POSTGRES_BKI_DATA) $(top_srcdir)/configure.in
+bki-stamp: genbki.pl Catalog.pm $(POSTGRES_BKI_SRCS) $(POSTGRES_BKI_DATA) $(top_srcdir)/configure.ac
 	$(PERL) $< --include-path=$(top_srcdir)/src/include/ \
 		--set-version=$(MAJORVERSION) $(POSTGRES_BKI_SRCS)
 	touch $@
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 73aa618166942..fb270df678a52 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -1,4 +1,4 @@
-/* src/include/pg_config.h.in.  Generated from configure.in by autoheader.  */
+/* src/include/pg_config.h.in.  Generated from configure.ac by autoheader.  */
 
 /* Define to the type of arg 1 of 'accept' */
 #undef ACCEPT_TYPE_ARG1
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 023da623826fc..bc8904732f017 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -155,9 +155,9 @@ sub GenerateFiles
 	my $ac_define_openssl_api_compat_found = 0;
 	my $openssl_api_compat;
 
-	# Parse configure.in to get version numbers
-	open(my $c, '<', "configure.in")
-	  || confess("Could not open configure.in for reading\n");
+	# Parse configure.ac to get version numbers
+	open(my $c, '<', "configure.ac")
+	  || confess("Could not open configure.ac for reading\n");
 	while (<$c>)
 	{
 		if (/^AC_INIT\(\[([^\]]+)\], \[([^\]]+)\], \[([^\]]+)\], \[([^\]]*)\], \[([^\]]+)\]/
@@ -185,7 +185,7 @@ sub GenerateFiles
 		}
 	}
 	close($c);
-	confess "Unable to parse configure.in for all variables!"
+	confess "Unable to parse configure.ac for all variables!"
 	  unless $ac_init_found && $ac_define_openssl_api_compat_found;
 
 	if (IsNewer("src/include/pg_config_os.h", "src/include/port/win32.h"))
@@ -834,7 +834,7 @@ EOF
 
 # Read lines from input file and substitute symbols using the same
 # logic that config.status uses.  There should be one call of this for
-# each AC_CONFIG_HEADERS call in configure.in.
+# each AC_CONFIG_HEADERS call in configure.ac.
 #
 # If the "required" argument is true, we also keep track which of our
 # defines have been found and error out if any are left unused at the
diff --git a/src/tools/version_stamp.pl b/src/tools/version_stamp.pl
index 3595587622253..36b18d514cf8d 100755
--- a/src/tools/version_stamp.pl
+++ b/src/tools/version_stamp.pl
@@ -9,10 +9,10 @@
 #################################################################
 
 #
-# This script updates the version stamp in configure.in, and also in assorted
+# This script updates the version stamp in configure.ac, and also in assorted
 # other files wherein it's not convenient to obtain the version number from
 # configure's output.  Note that you still have to run autoconf afterward
-# to regenerate configure from the updated configure.in.
+# to regenerate configure from the updated configure.ac.
 #
 # Usage: cd to top of source tree and issue
 #	src/tools/version_stamp.pl MINORVERSION
@@ -74,7 +74,7 @@
 # (this also ensures we're in the right directory)
 
 my $aconfver = "";
-open(my $fh, '<', "configure.in") || die "could not read configure.in: $!\n";
+open(my $fh, '<', "configure.ac") || die "could not read configure.ac: $!\n";
 while (<$fh>)
 {
 	if (m/^m4_if\(m4_defn\(\[m4_PACKAGE_VERSION\]\), \[(.*)\], \[\], \[m4_fatal/
@@ -86,13 +86,13 @@
 }
 close($fh);
 $aconfver ne ""
-  || die "could not find autoconf version number in configure.in\n";
+  || die "could not find autoconf version number in configure.ac\n";
 
-# Update configure.in and other files that contain version numbers
+# Update configure.ac and other files that contain version numbers
 
 my $fixedfiles = "";
 
-sed_file("configure.in",
+sed_file("configure.ac",
 	"-e 's/AC_INIT(\\[PostgreSQL\\], \\[[0-9a-z.]*\\]/AC_INIT([PostgreSQL], [$fullversion]/'"
 );
 

From 2f2007fbb255be178aca586780967f43885203a7 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 24 Jul 2020 15:26:51 -0400
Subject: [PATCH 08/54] Fix assorted bugs by changing TS_execute's callback API
 to ternary logic.

Text search sometimes failed to find valid matches, for instance
'!crew:A'::tsquery might fail to locate 'crew:1B'::tsvector during
an index search.  The root of the issue is that TS_execute's callback
functions were not changed to use ternary (yes/no/maybe) reporting
when we made the search logic itself do so.  It's somewhat annoying
to break that API, but on the other hand we now see that any code
using plain boolean logic is almost certainly broken since the
addition of phrase search.  There seem to be very few outside callers
of this code anyway, so we'll just break them intentionally to get
them to adapt.

This allows removal of tsginidx.c's private re-implementation of
TS_execute, since that's now entirely duplicative.  It's also no
longer necessary to avoid use of CALC_NOT in tsgistidx.c, since
the underlying callbacks can now do something reasonable.

Back-patch into v13.  We can't change this in stable branches,
but it seems not quite too late to fix it in v13.

Tom Lane and Pavel Borisov

Discussion: https://postgr.es/m/CALT9ZEE-aLotzBg-pOp2GFTesGWVYzXA3=mZKzRDa_OKnLF7Mg@mail.gmail.com
---
 src/backend/tsearch/wparser_def.c     |   8 +-
 src/backend/utils/adt/tsginidx.c      | 133 +++++------------------
 src/backend/utils/adt/tsgistidx.c     |  26 +++--
 src/backend/utils/adt/tsrank.c        |  12 ++-
 src/backend/utils/adt/tsvector_op.c   | 147 +++++++++++++++++---------
 src/include/tsearch/ts_utils.h        |  32 +++---
 src/test/regress/expected/tsearch.out | 144 +++++++++++++++++++++++++
 src/test/regress/expected/tstypes.out |  49 +++++++++
 src/test/regress/sql/tsearch.sql      |  24 +++++
 src/test/regress/sql/tstypes.sql      |   9 ++
 10 files changed, 395 insertions(+), 189 deletions(-)

diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index fda35abc74179..76b6f9aef03ca 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1962,7 +1962,7 @@ typedef struct
 /*
  * TS_execute callback for matching a tsquery operand to headline words
  */
-static bool
+static TSTernaryValue
 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
 {
 	hlCheck    *checkval = (hlCheck *) opaque;
@@ -1975,7 +1975,7 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* if data == NULL, don't need to report positions */
 			if (!data)
-				return true;
+				return TS_YES;
 
 			if (!data->pos)
 			{
@@ -1992,9 +1992,9 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
 	}
 
 	if (data && data->npos > 0)
-		return true;
+		return TS_YES;
 
-	return false;
+	return TS_NO;
 }
 
 /*
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 2d656168fca56..3128f0a7da074 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -178,9 +178,13 @@ typedef struct
 	bool	   *need_recheck;
 } GinChkVal;
 
-static GinTernaryValue
-checkcondition_gin_internal(GinChkVal *gcv, QueryOperand *val, ExecPhraseData *data)
+/*
+ * TS_execute callback for matching a tsquery operand to GIN index data
+ */
+static TSTernaryValue
+checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
 {
+	GinChkVal  *gcv = (GinChkVal *) checkval;
 	int			j;
 
 	/*
@@ -193,112 +197,22 @@ checkcondition_gin_internal(GinChkVal *gcv, QueryOperand *val, ExecPhraseData *d
 	/* convert item's number to corresponding entry's (operand's) number */
 	j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item];
 
-	/* return presence of current entry in indexed value */
-	return gcv->check[j];
-}
-
-/*
- * Wrapper of check condition function for TS_execute.
- */
-static bool
-checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
-{
-	return checkcondition_gin_internal((GinChkVal *) checkval,
-									   val,
-									   data) != GIN_FALSE;
-}
-
-/*
- * Evaluate tsquery boolean expression using ternary logic.
- *
- * Note: the reason we can't use TS_execute() for this is that its API
- * for the checkcondition callback doesn't allow a MAYBE result to be
- * returned, but we might have MAYBEs in the gcv->check array.
- * Perhaps we should change that API.
- */
-static GinTernaryValue
-TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem, bool in_phrase)
-{
-	GinTernaryValue val1,
-				val2,
-				result;
-
-	/* since this function recurses, it could be driven to stack overflow */
-	check_stack_depth();
-
-	if (curitem->type == QI_VAL)
-		return
-			checkcondition_gin_internal(gcv,
-										(QueryOperand *) curitem,
-										NULL /* don't have position info */ );
-
-	switch (curitem->qoperator.oper)
+	/*
+	 * return presence of current entry in indexed value; but TRUE becomes
+	 * MAYBE in the presence of a query requiring recheck
+	 */
+	if (gcv->check[j] == GIN_TRUE)
 	{
-		case OP_NOT:
-
-			/*
-			 * Below a phrase search, force NOT's result to MAYBE.  We cannot
-			 * invert a TRUE result from the subexpression to FALSE, since
-			 * TRUE only says that the subexpression matches somewhere, not
-			 * that it matches everywhere, so there might be positions where
-			 * the NOT will match.  We could invert FALSE to TRUE, but there's
-			 * little point in distinguishing TRUE from MAYBE, since a recheck
-			 * will have been forced already.
-			 */
-			if (in_phrase)
-				return GIN_MAYBE;
-
-			result = TS_execute_ternary(gcv, curitem + 1, in_phrase);
-			if (result == GIN_MAYBE)
-				return result;
-			return !result;
-
-		case OP_PHRASE:
-
-			/*
-			 * GIN doesn't contain any information about positions, so treat
-			 * OP_PHRASE as OP_AND with recheck requirement, and always
-			 * reporting MAYBE not TRUE.
-			 */
-			*(gcv->need_recheck) = true;
-			/* Pass down in_phrase == true in case there's a NOT below */
-			in_phrase = true;
-
-			/* FALL THRU */
-
-		case OP_AND:
-			val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left,
-									  in_phrase);
-			if (val1 == GIN_FALSE)
-				return GIN_FALSE;
-			val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase);
-			if (val2 == GIN_FALSE)
-				return GIN_FALSE;
-			if (val1 == GIN_TRUE && val2 == GIN_TRUE &&
-				curitem->qoperator.oper != OP_PHRASE)
-				return GIN_TRUE;
-			else
-				return GIN_MAYBE;
-
-		case OP_OR:
-			val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left,
-									  in_phrase);
-			if (val1 == GIN_TRUE)
-				return GIN_TRUE;
-			val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase);
-			if (val2 == GIN_TRUE)
-				return GIN_TRUE;
-			if (val1 == GIN_FALSE && val2 == GIN_FALSE)
-				return GIN_FALSE;
-			else
-				return GIN_MAYBE;
-
-		default:
-			elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+		if (val->weight != 0 || data != NULL)
+			return TS_MAYBE;
 	}
 
-	/* not reachable, but keep compiler quiet */
-	return false;
+	/*
+	 * We rely on GinTernaryValue and TSTernaryValue using equivalent value
+	 * assignments.  We could use a switch statement to map the values if that
+	 * ever stops being true, but it seems unlikely to happen.
+	 */
+	return (TSTernaryValue) gcv->check[j];
 }
 
 Datum
@@ -370,10 +284,11 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS)
 		gcv.map_item_operand = (int *) (extra_data[0]);
 		gcv.need_recheck = &recheck;
 
-		res = TS_execute_ternary(&gcv, GETQUERY(query), false);
-
-		if (res == GIN_TRUE && recheck)
-			res = GIN_MAYBE;
+		if (TS_execute(GETQUERY(query),
+					   &gcv,
+					   TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS,
+					   checkcondition_gin))
+			res = recheck ? GIN_MAYBE : GIN_TRUE;
 	}
 
 	PG_RETURN_GIN_TERNARY_VALUE(res);
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index c3f25800e7b27..927aed9156498 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -273,9 +273,9 @@ typedef struct
 } CHKVAL;
 
 /*
- * is there value 'val' in array or not ?
+ * TS_execute callback for matching a tsquery operand to GIST leaf-page data
  */
-static bool
+static TSTernaryValue
 checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data)
 {
 	int32	   *StopLow = ((CHKVAL *) checkval)->arrb;
@@ -288,23 +288,26 @@ checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data)
 	 * we are not able to find a prefix by hash value
 	 */
 	if (val->prefix)
-		return true;
+		return TS_MAYBE;
 
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
 		if (*StopMiddle == val->valcrc)
-			return true;
+			return TS_MAYBE;
 		else if (*StopMiddle < val->valcrc)
 			StopLow = StopMiddle + 1;
 		else
 			StopHigh = StopMiddle;
 	}
 
-	return false;
+	return TS_NO;
 }
 
-static bool
+/*
+ * TS_execute callback for matching a tsquery operand to GIST non-leaf data
+ */
+static TSTernaryValue
 checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data)
 {
 	void	   *key = (SignTSVector *) checkval;
@@ -313,8 +316,12 @@ checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data)
 	 * we are not able to find a prefix in signature tree
 	 */
 	if (val->prefix)
-		return true;
-	return GETBIT(GETSIGN(key), HASHVAL(val->valcrc, GETSIGLEN(key)));
+		return TS_MAYBE;
+
+	if (GETBIT(GETSIGN(key), HASHVAL(val->valcrc, GETSIGLEN(key))))
+		return TS_MAYBE;
+	else
+		return TS_NO;
 }
 
 Datum
@@ -339,10 +346,9 @@ gtsvector_consistent(PG_FUNCTION_ARGS)
 		if (ISALLTRUE(key))
 			PG_RETURN_BOOL(true);
 
-		/* since signature is lossy, cannot specify CALC_NOT here */
 		PG_RETURN_BOOL(TS_execute(GETQUERY(query),
 								  key,
-								  TS_EXEC_PHRASE_NO_POS,
+								  TS_EXEC_PHRASE_NO_POS | TS_EXEC_CALC_NOT,
 								  checkcondition_bit));
 	}
 	else
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index 07251dd577c8f..cbd97abccde4f 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -556,14 +556,18 @@ typedef struct
 #define QR_GET_OPERAND_DATA(q, v) \
 	( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
 
-static bool
-checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
+/*
+ * TS_execute callback for matching a tsquery operand to QueryRepresentation
+ */
+static TSTernaryValue
+checkcondition_QueryOperand(void *checkval, QueryOperand *val,
+							ExecPhraseData *data)
 {
 	QueryRepresentation *qr = (QueryRepresentation *) checkval;
 	QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
 
 	if (!opData->operandexists)
-		return false;
+		return TS_NO;
 
 	if (data)
 	{
@@ -573,7 +577,7 @@ checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *d
 			data->pos += MAXQROPOS - opData->npos;
 	}
 
-	return true;
+	return TS_YES;
 }
 
 typedef struct
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 51619c396c74a..6df943abd4e1b 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -67,14 +67,6 @@ typedef struct
 	StatEntry  *root;
 } TSVectorStat;
 
-/* TS_execute requires ternary logic to handle NOT with phrase matches */
-typedef enum
-{
-	TS_NO,						/* definitely no match */
-	TS_YES,						/* definitely does match */
-	TS_MAYBE					/* can't verify match for lack of pos data */
-} TSTernaryValue;
-
 
 static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
 										 uint32 flags,
@@ -1188,13 +1180,15 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
 /*
  * Check weight info or/and fill 'data' with the required positions
  */
-static bool
+static TSTernaryValue
 checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 			   ExecPhraseData *data)
 {
-	bool		result = false;
+	TSTernaryValue result = TS_NO;
 
-	if (entry->haspos && (val->weight || data))
+	Assert(data == NULL || data->npos == 0);
+
+	if (entry->haspos)
 	{
 		WordEntryPosVector *posvec;
 
@@ -1232,7 +1226,13 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 			data->npos = dptr - data->pos;
 
 			if (data->npos > 0)
-				result = true;
+				result = TS_YES;
+			else
+			{
+				pfree(data->pos);
+				data->pos = NULL;
+				data->allocated = false;
+			}
 		}
 		else if (val->weight)
 		{
@@ -1243,40 +1243,57 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 			{
 				if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
 				{
-					result = true;
+					result = TS_YES;
 					break;		/* no need to go further */
 				}
 
 				posvec_iter++;
 			}
 		}
-		else					/* data != NULL */
+		else if (data)
 		{
 			data->npos = posvec->npos;
 			data->pos = posvec->pos;
 			data->allocated = false;
-			result = true;
+			result = TS_YES;
+		}
+		else
+		{
+			/* simplest case: no weight check, positions not needed */
+			result = TS_YES;
 		}
 	}
 	else
 	{
-		result = true;
+		/*
+		 * Position info is lacking, so if the caller requires it, we can only
+		 * say that maybe there is a match.
+		 *
+		 * Notice, however, that we *don't* check val->weight here.
+		 * Historically, stripped tsvectors are considered to match queries
+		 * whether or not the query has a weight restriction; that's a little
+		 * dubious but we'll preserve the behavior.
+		 */
+		if (data)
+			result = TS_MAYBE;
+		else
+			result = TS_YES;
 	}
 
 	return result;
 }
 
 /*
- * is there value 'val' in array or not ?
+ * TS_execute callback for matching a tsquery operand to plain tsvector data
  */
-static bool
+static TSTernaryValue
 checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 {
 	CHKVAL	   *chkval = (CHKVAL *) checkval;
 	WordEntry  *StopLow = chkval->arrb;
 	WordEntry  *StopHigh = chkval->arre;
 	WordEntry  *StopMiddle = StopHigh;
-	bool		res = false;
+	TSTernaryValue res = TS_NO;
 
 	/* Loop invariant: StopLow <= val < StopHigh */
 	while (StopLow < StopHigh)
@@ -1302,36 +1319,69 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 			StopHigh = StopMiddle;
 	}
 
-	if ((!res || data) && val->prefix)
+	/*
+	 * If it's a prefix search, we should also consider lexemes that the
+	 * search term is a prefix of (which will necessarily immediately follow
+	 * the place we found in the above loop).  But we can skip them if there
+	 * was a definite match on the exact term AND the caller doesn't need
+	 * position info.
+	 */
+	if (val->prefix && (res != TS_YES || data))
 	{
 		WordEntryPos *allpos = NULL;
 		int			npos = 0,
 					totalpos = 0;
 
-		/*
-		 * there was a failed exact search, so we should scan further to find
-		 * a prefix match. We also need to do so if caller needs position info
-		 */
+		/* adjust start position for corner case */
 		if (StopLow >= StopHigh)
 			StopMiddle = StopHigh;
 
-		while ((!res || data) && StopMiddle < chkval->arre &&
+		/* we don't try to re-use any data from the initial match */
+		if (data)
+		{
+			if (data->allocated)
+				pfree(data->pos);
+			data->pos = NULL;
+			data->allocated = false;
+			data->npos = 0;
+		}
+		res = TS_NO;
+
+		while ((res != TS_YES || data) &&
+			   StopMiddle < chkval->arre &&
 			   tsCompareString(chkval->operand + val->distance,
 							   val->length,
 							   chkval->values + StopMiddle->pos,
 							   StopMiddle->len,
 							   true) == 0)
 		{
-			if (data)
-			{
-				/*
-				 * We need to join position information
-				 */
-				res = checkclass_str(chkval, StopMiddle, val, data);
+			TSTernaryValue subres;
+
+			subres = checkclass_str(chkval, StopMiddle, val, data);
 
-				if (res)
+			if (subres != TS_NO)
+			{
+				if (data)
 				{
-					while (npos + data->npos >= totalpos)
+					/*
+					 * We need to join position information
+					 */
+					if (subres == TS_MAYBE)
+					{
+						/*
+						 * No position info for this match, so we must report
+						 * MAYBE overall.
+						 */
+						res = TS_MAYBE;
+						/* forget any previous positions */
+						npos = 0;
+						/* don't leak storage */
+						if (allpos)
+							pfree(allpos);
+						break;
+					}
+
+					while (npos + data->npos > totalpos)
 					{
 						if (totalpos == 0)
 						{
@@ -1347,22 +1397,27 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 
 					memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
 					npos += data->npos;
+
+					/* don't leak storage from individual matches */
+					if (data->allocated)
+						pfree(data->pos);
+					data->pos = NULL;
+					data->allocated = false;
+					/* it's important to reset data->npos before next loop */
+					data->npos = 0;
 				}
 				else
 				{
-					/* at loop exit, res must be true if we found matches */
-					res = (npos > 0);
+					/* Don't need positions, just handle YES/MAYBE */
+					if (subres == TS_YES || res == TS_NO)
+						res = subres;
 				}
 			}
-			else
-			{
-				res = checkclass_str(chkval, StopMiddle, val, NULL);
-			}
 
 			StopMiddle++;
 		}
 
-		if (res && data)
+		if (data && npos > 0)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
@@ -1370,6 +1425,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 			data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
 								 compareWordEntryPos);
 			data->allocated = true;
+			res = TS_YES;
 		}
 	}
 
@@ -1561,14 +1617,7 @@ TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
 	check_stack_depth();
 
 	if (curitem->type == QI_VAL)
-	{
-		if (!chkcond(arg, (QueryOperand *) curitem, data))
-			return TS_NO;
-		if (data->npos > 0 || data->negate)
-			return TS_YES;
-		/* If we have no position data, we must return TS_MAYBE */
-		return TS_MAYBE;
-	}
+		return chkcond(arg, (QueryOperand *) curitem, data);
 
 	switch (curitem->qoperator.oper)
 	{
@@ -1821,7 +1870,7 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
 
 	if (curitem->type == QI_VAL)
 		return chkcond(arg, (QueryOperand *) curitem,
-					   NULL /* don't need position info */ ) ? TS_YES : TS_NO;
+					   NULL /* don't need position info */ );
 
 	switch (curitem->qoperator.oper)
 	{
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
index f78fbd9d1a4c6..609b0c7e9bdf9 100644
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -124,13 +124,21 @@ extern text *generateHeadline(HeadlineParsedText *prs);
  * whether a given primitive tsquery value is matched in the data.
  */
 
+/* TS_execute requires ternary logic to handle NOT with phrase matches */
+typedef enum
+{
+	TS_NO,						/* definitely no match */
+	TS_YES,						/* definitely does match */
+	TS_MAYBE					/* can't verify match for lack of pos data */
+} TSTernaryValue;
+
 /*
  * struct ExecPhraseData is passed to a TSExecuteCallback function if we need
  * lexeme position data (because of a phrase-match operator in the tsquery).
- * The callback should fill in position data when it returns true (success).
- * If it cannot return position data, it may leave "data" unchanged, but
- * then the caller of TS_execute() must pass the TS_EXEC_PHRASE_NO_POS flag
- * and must arrange for a later recheck with position data available.
+ * The callback should fill in position data when it returns TS_YES (success).
+ * If it cannot return position data, it should leave "data" unchanged and
+ * return TS_MAYBE.  The caller of TS_execute() must then arrange for a later
+ * recheck with position data available.
  *
  * The reported lexeme positions must be sorted and unique.  Callers must only
  * consult the position bits of the pos array, ie, WEP_GETPOS(data->pos[i]).
@@ -162,12 +170,13 @@ typedef struct ExecPhraseData
  * val: lexeme to test for presence of
  * data: to be filled with lexeme positions; NULL if position data not needed
  *
- * Return true if lexeme is present in data, else false.  If data is not
- * NULL, it should be filled with lexeme positions, but function can leave
- * it as zeroes if position data is not available.
+ * Return TS_YES if lexeme is present in data, TS_MAYBE if it might be
+ * present, TS_NO if it definitely is not present.  If data is not NULL,
+ * it must be filled with lexeme positions if available.  If position data
+ * is not available, leave *data as zeroes and return TS_MAYBE, never TS_YES.
  */
-typedef bool (*TSExecuteCallback) (void *arg, QueryOperand *val,
-								   ExecPhraseData *data);
+typedef TSTernaryValue (*TSExecuteCallback) (void *arg, QueryOperand *val,
+											 ExecPhraseData *data);
 
 /*
  * Flag bits for TS_execute
@@ -175,10 +184,7 @@ typedef bool (*TSExecuteCallback) (void *arg, QueryOperand *val,
 #define TS_EXEC_EMPTY			(0x00)
 /*
  * If TS_EXEC_CALC_NOT is not set, then NOT expressions are automatically
- * evaluated to be true.  Useful in cases where NOT cannot be accurately
- * computed (GiST) or it isn't important (ranking).  From TS_execute's
- * perspective, !CALC_NOT means that the TSExecuteCallback function might
- * return false-positive indications of a lexeme's presence.
+ * evaluated to be true.  Useful in cases where NOT isn't important (ranking).
  */
 #define TS_EXEC_CALC_NOT		(0x01)
 /*
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
index 7105c67cf8f88..0110b4d2e0d8d 100644
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -176,6 +176,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 create index wowidx on test_tsvector using gist (a);
 SET enable_seqscan=OFF;
 SET enable_indexscan=ON;
@@ -308,6 +332,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 SET enable_indexscan=OFF;
 SET enable_bitmapscan=ON;
 explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
@@ -440,6 +488,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 -- Test siglen parameter of GiST tsvector_ops
 CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(foo=1));
 ERROR:  unrecognized parameter "foo"
@@ -595,6 +667,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 DROP INDEX wowidx2;
 CREATE INDEX wowidx ON test_tsvector USING gist (a tsvector_ops(siglen=484));
 \d test_tsvector
@@ -736,6 +832,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 RESET enable_seqscan;
 RESET enable_indexscan;
 RESET enable_bitmapscan;
@@ -873,6 +993,30 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
    507
 (1 row)
 
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+ count 
+-------
+    56
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+ count 
+-------
+    58
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+ count 
+-------
+   452
+(1 row)
+
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
+ count 
+-------
+   450
+(1 row)
+
 -- Test optimization of non-empty GIN_SEARCH_MODE_ALL queries
 EXPLAIN (COSTS OFF)
 SELECT count(*) FROM test_tsvector WHERE a @@ '!qh';
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index ee4a2490bbb34..2601e312df4eb 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -551,6 +551,55 @@ SELECT 'wa:1A wb:2D'::tsvector @@ 'w:*D <-> w:*A'::tsquery as "false";
  f
 (1 row)
 
+SELECT 'wa:1A'::tsvector @@ 'w:*A'::tsquery as "true";
+ true 
+------
+ t
+(1 row)
+
+SELECT 'wa:1A'::tsvector @@ 'w:*D'::tsquery as "false";
+ false 
+-------
+ f
+(1 row)
+
+SELECT 'wa:1A'::tsvector @@ '!w:*A'::tsquery as "false";
+ false 
+-------
+ f
+(1 row)
+
+SELECT 'wa:1A'::tsvector @@ '!w:*D'::tsquery as "true";
+ true 
+------
+ t
+(1 row)
+
+-- historically, a stripped tsvector matches queries ignoring weights:
+SELECT strip('wa:1A'::tsvector) @@ 'w:*A'::tsquery as "true";
+ true 
+------
+ t
+(1 row)
+
+SELECT strip('wa:1A'::tsvector) @@ 'w:*D'::tsquery as "true";
+ true 
+------
+ t
+(1 row)
+
+SELECT strip('wa:1A'::tsvector) @@ '!w:*A'::tsquery as "false";
+ false 
+-------
+ f
+(1 row)
+
+SELECT strip('wa:1A'::tsvector) @@ '!w:*D'::tsquery as "false";
+ false 
+-------
+ f
+(1 row)
+
 SELECT 'supernova'::tsvector @@ 'super'::tsquery AS "false";
  false 
 -------
diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql
index e53e44f7ed1c3..8a27fcd8b0b86 100644
--- a/src/test/regress/sql/tsearch.sql
+++ b/src/test/regress/sql/tsearch.sql
@@ -61,6 +61,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 create index wowidx on test_tsvector using gist (a);
 
@@ -90,6 +94,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 SET enable_indexscan=OFF;
 SET enable_bitmapscan=ON;
@@ -116,6 +124,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 -- Test siglen parameter of GiST tsvector_ops
 CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(foo=1));
@@ -152,6 +164,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 DROP INDEX wowidx2;
 
@@ -181,6 +197,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 RESET enable_seqscan;
 RESET enable_indexscan;
@@ -215,6 +235,10 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
 SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
+SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
 
 -- Test optimization of non-empty GIN_SEARCH_MODE_ALL queries
 EXPLAIN (COSTS OFF)
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index 50b4359c9a401..30c8c702f0917 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -104,6 +104,15 @@ SELECT 'a b:89  ca:23A,64c cb:80b d:34c'::tsvector @@ 'd:AC & c:*B' as "true";
 SELECT 'wa:1D wb:2A'::tsvector @@ 'w:*D & w:*A'::tsquery as "true";
 SELECT 'wa:1D wb:2A'::tsvector @@ 'w:*D <-> w:*A'::tsquery as "true";
 SELECT 'wa:1A wb:2D'::tsvector @@ 'w:*D <-> w:*A'::tsquery as "false";
+SELECT 'wa:1A'::tsvector @@ 'w:*A'::tsquery as "true";
+SELECT 'wa:1A'::tsvector @@ 'w:*D'::tsquery as "false";
+SELECT 'wa:1A'::tsvector @@ '!w:*A'::tsquery as "false";
+SELECT 'wa:1A'::tsvector @@ '!w:*D'::tsquery as "true";
+-- historically, a stripped tsvector matches queries ignoring weights:
+SELECT strip('wa:1A'::tsvector) @@ 'w:*A'::tsquery as "true";
+SELECT strip('wa:1A'::tsvector) @@ 'w:*D'::tsquery as "true";
+SELECT strip('wa:1A'::tsvector) @@ '!w:*A'::tsquery as "false";
+SELECT strip('wa:1A'::tsvector) @@ '!w:*D'::tsquery as "false";
 
 SELECT 'supernova'::tsvector @@ 'super'::tsquery AS "false";
 SELECT 'supeanova supernova'::tsvector @@ 'super'::tsquery AS "false";

From 79d6d1a277ee1cdda90f9a66d7970ac3885822de Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 24 Jul 2020 15:43:56 -0400
Subject: [PATCH 09/54] Replace TS_execute's TS_EXEC_CALC_NOT flag with
 TS_EXEC_SKIP_NOT.

It's fairly silly that ignoring NOT subexpressions is TS_execute's
default behavior.  It's wrong on its face and it encourages errors
of omission.  Moreover, the only two remaining callers that aren't
specifying CALC_NOT are in ts_headline calculations, and it's very
arguable that those are bugs: if you've specified "!foo" in your
query, why would you want to get a headline that includes "foo"?

Hence, rip that out and change the default behavior to be to calculate
NOT accurately.  As a concession to the slim chance that there is still
somebody somewhere who needs the incorrect behavior, provide a new
SKIP_NOT flag to explicitly request that.

Back-patch into v13, mainly because it seems better to change this
at the same time as the previous commit's rejiggering of TS_execute
related APIs.  Any outside callers affected by this change are
probably also affected by that one.

Discussion: https://postgr.es/m/CALT9ZEE-aLotzBg-pOp2GFTesGWVYzXA3=mZKzRDa_OKnLF7Mg@mail.gmail.com
---
 src/backend/utils/adt/tsginidx.c    | 4 ++--
 src/backend/utils/adt/tsgistidx.c   | 4 ++--
 src/backend/utils/adt/tsrank.c      | 2 +-
 src/backend/utils/adt/tsvector_op.c | 8 ++++----
 src/include/tsearch/ts_utils.h      | 8 +++++---
 5 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 3128f0a7da074..b3e3ffc577635 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -248,7 +248,7 @@ gin_tsquery_consistent(PG_FUNCTION_ARGS)
 
 		res = TS_execute(GETQUERY(query),
 						 &gcv,
-						 TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS,
+						 TS_EXEC_PHRASE_NO_POS,
 						 checkcondition_gin);
 	}
 
@@ -286,7 +286,7 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS)
 
 		if (TS_execute(GETQUERY(query),
 					   &gcv,
-					   TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS,
+					   TS_EXEC_PHRASE_NO_POS,
 					   checkcondition_gin))
 			res = recheck ? GIN_MAYBE : GIN_TRUE;
 	}
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 927aed9156498..a601965bd83e6 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -348,7 +348,7 @@ gtsvector_consistent(PG_FUNCTION_ARGS)
 
 		PG_RETURN_BOOL(TS_execute(GETQUERY(query),
 								  key,
-								  TS_EXEC_PHRASE_NO_POS | TS_EXEC_CALC_NOT,
+								  TS_EXEC_PHRASE_NO_POS,
 								  checkcondition_bit));
 	}
 	else
@@ -359,7 +359,7 @@ gtsvector_consistent(PG_FUNCTION_ARGS)
 		chkval.arre = chkval.arrb + ARRNELEM(key);
 		PG_RETURN_BOOL(TS_execute(GETQUERY(query),
 								  (void *) &chkval,
-								  TS_EXEC_PHRASE_NO_POS | TS_EXEC_CALC_NOT,
+								  TS_EXEC_PHRASE_NO_POS,
 								  checkcondition_arr));
 	}
 }
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index cbd97abccde4f..c88ebfc7d4113 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -697,7 +697,7 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
 		fillQueryRepresentationData(qr, ptr);
 
 		if (TS_execute(GETQUERY(qr->query), (void *) qr,
-					   TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
+					   TS_EXEC_EMPTY, checkcondition_QueryOperand))
 		{
 			if (WEP_GETPOS(ptr->pos) < ext->p)
 			{
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 6df943abd4e1b..f01b1ee25377f 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1627,9 +1627,9 @@ TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
 			 * We need not touch data->width, since a NOT operation does not
 			 * change the match width.
 			 */
-			if (!(flags & TS_EXEC_CALC_NOT))
+			if (flags & TS_EXEC_SKIP_NOT)
 			{
-				/* without CALC_NOT, report NOT as "match everywhere" */
+				/* with SKIP_NOT, report NOT as "match everywhere" */
 				Assert(data->npos == 0 && !data->negate);
 				data->negate = true;
 				return TS_YES;
@@ -1875,7 +1875,7 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
 	switch (curitem->qoperator.oper)
 	{
 		case OP_NOT:
-			if (!(flags & TS_EXEC_CALC_NOT))
+			if (flags & TS_EXEC_SKIP_NOT)
 				return TS_YES;
 			switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
 			{
@@ -2038,7 +2038,7 @@ ts_match_vq(PG_FUNCTION_ARGS)
 	chkval.operand = GETOPERAND(query);
 	result = TS_execute(GETQUERY(query),
 						&chkval,
-						TS_EXEC_CALC_NOT,
+						TS_EXEC_EMPTY,
 						checkcondition_str);
 
 	PG_FREE_IF_COPY(val, 0);
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
index 609b0c7e9bdf9..400ba33001484 100644
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -183,10 +183,12 @@ typedef TSTernaryValue (*TSExecuteCallback) (void *arg, QueryOperand *val,
  */
 #define TS_EXEC_EMPTY			(0x00)
 /*
- * If TS_EXEC_CALC_NOT is not set, then NOT expressions are automatically
- * evaluated to be true.  Useful in cases where NOT isn't important (ranking).
+ * If TS_EXEC_SKIP_NOT is set, then NOT sub-expressions are automatically
+ * evaluated to be true.  This was formerly the default behavior.  It's now
+ * deprecated because it tends to give silly answers, but some applications
+ * might still have a use for it.
  */
-#define TS_EXEC_CALC_NOT		(0x01)
+#define TS_EXEC_SKIP_NOT		(0x01)
 /*
  * If TS_EXEC_PHRASE_NO_POS is set, allow OP_PHRASE to be executed lossily
  * in the absence of position information: a true result indicates that the

From 2a2494229a709b880a6db82d8b267017fccf671f Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Sat, 25 Jul 2020 10:20:39 +0530
Subject: [PATCH 10/54] Fix buffer usage stats for nodes above Gather Merge.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 85c9d347 addressed a similar problem for Gather and Gather
Merge nodes but forgot to account for nodes above parallel nodes.  This
still works for nodes above Gather node because we shut down the workers
for Gather node as soon as there are no more tuples.  We can do a similar
thing for Gather Merge as well but it seems better to account for stats
during nodes shutdown after completing the execution.

Reported-by: Stéphane Lorek, Jehan-Guillaume de Rorthais
Author: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Reviewed-by: Amit Kapila
Backpatch-through: 10, where it was introduced
Discussion: https://postgr.es/m/20200718160206.584532a2@firost
---
 src/backend/executor/execProcnode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 5662e7d74215e..01b7b926bf705 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -755,8 +755,6 @@ ExecShutdownNode(PlanState *node)
 
 	check_stack_depth();
 
-	planstate_tree_walker(node, ExecShutdownNode, NULL);
-
 	/*
 	 * Treat the node as running while we shut it down, but only if it's run
 	 * at least once already.  We don't expect much CPU consumption during
@@ -770,6 +768,8 @@ ExecShutdownNode(PlanState *node)
 	if (node->instrument && node->instrument->running)
 		InstrStartNode(node->instrument);
 
+	planstate_tree_walker(node, ExecShutdownNode, NULL);
+
 	switch (nodeTag(node))
 	{
 		case T_GatherState:

From 8a37951eebffd9bf528cb06d46127fb721d0e452 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 25 Jul 2020 12:54:58 -0400
Subject: [PATCH 11/54] Mark built-in coercion functions as leakproof where
 possible.

Making these leakproof seems helpful since (for example) if you have a
function f(int8) that is leakproof, you don't want it to effectively
become non-leakproof when you apply it to an int4 or int2 column.
But that's what happens today, since the implicit up-coercion will
not be leakproof.

Most of the coercion functions that visibly can't throw errors are
functions that convert numeric datatypes to other, wider ones.
Notable is that float4_numeric and float8_numeric can be marked
leakproof; before commit a57d312a7 they could not have been.
I also marked the functions that coerce strings to "name" as leakproof;
that's okay today because they truncate silently, but if we ever
reconsidered that behavior then they could no longer be leakproof.

I desisted from marking rtrim1() as leakproof; it appears so right now,
but the code seems a little too complex and perhaps subject to change,
since it's shared with other SQL functions.

Discussion: https://postgr.es/m/459322.1595607431@sss.pgh.pa.us
---
 src/include/catalog/catversion.h         |   2 +-
 src/include/catalog/pg_proc.dat          | 103 ++++++++++++-----------
 src/test/regress/expected/opr_sanity.out |  25 ++++++
 3 files changed, 78 insertions(+), 52 deletions(-)

diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index ed3aef93d04da..928495112196a 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202007202
+#define CATALOG_VERSION_NO	202007251
 
 #endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 4b5af32440fbc..082a11f2708c6 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -698,11 +698,11 @@
   proname => 'dlog1', prorettype => 'float8', proargtypes => 'float8',
   prosrc => 'dlog1' },
 { oid => '235', descr => 'convert int2 to float8',
-  proname => 'float8', prorettype => 'float8', proargtypes => 'int2',
-  prosrc => 'i2tod' },
+  proname => 'float8', proleakproof => 't', prorettype => 'float8',
+  proargtypes => 'int2', prosrc => 'i2tod' },
 { oid => '236', descr => 'convert int2 to float4',
-  proname => 'float4', prorettype => 'float4', proargtypes => 'int2',
-  prosrc => 'i2tof' },
+  proname => 'float4', proleakproof => 't', prorettype => 'float4',
+  proargtypes => 'int2', prosrc => 'i2tof' },
 { oid => '237', descr => 'convert float8 to int2',
   proname => 'int2', prorettype => 'int2', proargtypes => 'float8',
   prosrc => 'dtoi2' },
@@ -879,26 +879,26 @@
   proargtypes => 'float8 float8 float8 int4', prosrc => 'width_bucket_float8' },
 
 { oid => '311', descr => 'convert float4 to float8',
-  proname => 'float8', prorettype => 'float8', proargtypes => 'float4',
-  prosrc => 'ftod' },
+  proname => 'float8', proleakproof => 't', prorettype => 'float8',
+  proargtypes => 'float4', prosrc => 'ftod' },
 { oid => '312', descr => 'convert float8 to float4',
   proname => 'float4', prorettype => 'float4', proargtypes => 'float8',
   prosrc => 'dtof' },
 { oid => '313', descr => 'convert int2 to int4',
-  proname => 'int4', prorettype => 'int4', proargtypes => 'int2',
-  prosrc => 'i2toi4' },
+  proname => 'int4', proleakproof => 't', prorettype => 'int4',
+  proargtypes => 'int2', prosrc => 'i2toi4' },
 { oid => '314', descr => 'convert int4 to int2',
   proname => 'int2', prorettype => 'int2', proargtypes => 'int4',
   prosrc => 'i4toi2' },
 { oid => '316', descr => 'convert int4 to float8',
-  proname => 'float8', prorettype => 'float8', proargtypes => 'int4',
-  prosrc => 'i4tod' },
+  proname => 'float8', proleakproof => 't', prorettype => 'float8',
+  proargtypes => 'int4', prosrc => 'i4tod' },
 { oid => '317', descr => 'convert float8 to int4',
   proname => 'int4', prorettype => 'int4', proargtypes => 'float8',
   prosrc => 'dtoi4' },
 { oid => '318', descr => 'convert int4 to float4',
-  proname => 'float4', prorettype => 'float4', proargtypes => 'int4',
-  prosrc => 'i4tof' },
+  proname => 'float4', proleakproof => 't', prorettype => 'float4',
+  proargtypes => 'int4', prosrc => 'i4tof' },
 { oid => '319', descr => 'convert float4 to int4',
   proname => 'int4', prorettype => 'int4', proargtypes => 'float4',
   prosrc => 'ftoi4' },
@@ -1150,17 +1150,17 @@
   proname => 'text', prorettype => 'text', proargtypes => 'bpchar',
   prosrc => 'rtrim1' },
 { oid => '406', descr => 'convert name to text',
-  proname => 'text', prorettype => 'text', proargtypes => 'name',
-  prosrc => 'name_text' },
+  proname => 'text', proleakproof => 't', prorettype => 'text',
+  proargtypes => 'name', prosrc => 'name_text' },
 { oid => '407', descr => 'convert text to name',
-  proname => 'name', prorettype => 'name', proargtypes => 'text',
-  prosrc => 'text_name' },
+  proname => 'name', proleakproof => 't', prorettype => 'name',
+  proargtypes => 'text', prosrc => 'text_name' },
 { oid => '408', descr => 'convert name to char(n)',
   proname => 'bpchar', prorettype => 'bpchar', proargtypes => 'name',
   prosrc => 'name_bpchar' },
 { oid => '409', descr => 'convert char(n) to name',
-  proname => 'name', prorettype => 'name', proargtypes => 'bpchar',
-  prosrc => 'bpchar_name' },
+  proname => 'name', proleakproof => 't', prorettype => 'name',
+  proargtypes => 'bpchar', prosrc => 'bpchar_name' },
 
 { oid => '449', descr => 'hash',
   proname => 'hashint2', prorettype => 'int4', proargtypes => 'int2',
@@ -1338,11 +1338,11 @@
   proname => 'int4', prorettype => 'int4', proargtypes => 'int8',
   prosrc => 'int84' },
 { oid => '481', descr => 'convert int4 to int8',
-  proname => 'int8', prorettype => 'int8', proargtypes => 'int4',
-  prosrc => 'int48' },
+  proname => 'int8', proleakproof => 't', prorettype => 'int8',
+  proargtypes => 'int4', prosrc => 'int48' },
 { oid => '482', descr => 'convert int8 to float8',
-  proname => 'float8', prorettype => 'float8', proargtypes => 'int8',
-  prosrc => 'i8tod' },
+  proname => 'float8', proleakproof => 't', prorettype => 'float8',
+  proargtypes => 'int8', prosrc => 'i8tod' },
 { oid => '483', descr => 'convert float8 to int8',
   proname => 'int8', prorettype => 'int8', proargtypes => 'float8',
   prosrc => 'dtoi8' },
@@ -1359,8 +1359,8 @@
   proargtypes => 'anyarray int8', prosrc => 'hash_array_extended' },
 
 { oid => '652', descr => 'convert int8 to float4',
-  proname => 'float4', prorettype => 'float4', proargtypes => 'int8',
-  prosrc => 'i8tof' },
+  proname => 'float4', proleakproof => 't', prorettype => 'float4',
+  proargtypes => 'int8', prosrc => 'i8tof' },
 { oid => '653', descr => 'convert float4 to int8',
   proname => 'int8', prorettype => 'int8', proargtypes => 'float4',
   prosrc => 'ftoi8' },
@@ -1369,8 +1369,8 @@
   proname => 'int2', prorettype => 'int2', proargtypes => 'int8',
   prosrc => 'int82' },
 { oid => '754', descr => 'convert int2 to int8',
-  proname => 'int8', prorettype => 'int8', proargtypes => 'int2',
-  prosrc => 'int28' },
+  proname => 'int8', proleakproof => 't', prorettype => 'int8',
+  proargtypes => 'int2', prosrc => 'int28' },
 
 { oid => '655',
   proname => 'namelt', proleakproof => 't', prorettype => 'bool',
@@ -2521,8 +2521,8 @@
   proname => 'oid', prorettype => 'oid', proargtypes => 'int8',
   prosrc => 'i8tooid' },
 { oid => '1288', descr => 'convert oid to int8',
-  proname => 'int8', prorettype => 'int8', proargtypes => 'oid',
-  prosrc => 'oidtoi8' },
+  proname => 'int8', proleakproof => 't', prorettype => 'int8',
+  proargtypes => 'oid', prosrc => 'oidtoi8' },
 
 { oid => '1291',
   descr => 'trigger to suppress updates when new and old records match',
@@ -2782,8 +2782,8 @@
   prosrc => 'textlen' },
 
 { oid => '1370', descr => 'convert time to interval',
-  proname => 'interval', prorettype => 'interval', proargtypes => 'time',
-  prosrc => 'time_interval' },
+  proname => 'interval', proleakproof => 't', prorettype => 'interval',
+  proargtypes => 'time', prosrc => 'time_interval' },
 { oid => '1372', descr => 'character length',
   proname => 'char_length', prorettype => 'int4', proargtypes => 'bpchar',
   prosrc => 'bpcharlen' },
@@ -2861,11 +2861,11 @@
 # OIDS 1400 - 1499
 
 { oid => '1400', descr => 'convert varchar to name',
-  proname => 'name', prorettype => 'name', proargtypes => 'varchar',
-  prosrc => 'text_name' },
+  proname => 'name', proleakproof => 't', prorettype => 'name',
+  proargtypes => 'varchar', prosrc => 'text_name' },
 { oid => '1401', descr => 'convert name to varchar',
-  proname => 'varchar', prorettype => 'varchar', proargtypes => 'name',
-  prosrc => 'name_text' },
+  proname => 'varchar', proleakproof => 't', prorettype => 'varchar',
+  proargtypes => 'name', prosrc => 'name_text' },
 
 { oid => '1402', descr => 'current schema name',
   proname => 'current_schema', provolatile => 's', proparallel => 'u',
@@ -3941,8 +3941,8 @@
   proname => 'macaddr8_or', prorettype => 'macaddr8',
   proargtypes => 'macaddr8 macaddr8', prosrc => 'macaddr8_or' },
 { oid => '4123', descr => 'convert macaddr to macaddr8',
-  proname => 'macaddr8', prorettype => 'macaddr8', proargtypes => 'macaddr',
-  prosrc => 'macaddrtomacaddr8' },
+  proname => 'macaddr8', proleakproof => 't', prorettype => 'macaddr8',
+  proargtypes => 'macaddr', prosrc => 'macaddrtomacaddr8' },
 { oid => '4124', descr => 'convert macaddr8 to macaddr',
   proname => 'macaddr', prorettype => 'macaddr', proargtypes => 'macaddr8',
   prosrc => 'macaddr8tomacaddr' },
@@ -4321,8 +4321,8 @@
   proname => 'trim_scale', prorettype => 'numeric', proargtypes => 'numeric',
   prosrc => 'numeric_trim_scale' },
 { oid => '1740', descr => 'convert int4 to numeric',
-  proname => 'numeric', prorettype => 'numeric', proargtypes => 'int4',
-  prosrc => 'int4_numeric' },
+  proname => 'numeric', proleakproof => 't', prorettype => 'numeric',
+  proargtypes => 'int4', prosrc => 'int4_numeric' },
 { oid => '1741', descr => 'base 10 logarithm',
   proname => 'log', prolang => 'sql', prorettype => 'numeric',
   proargtypes => 'numeric', prosrc => 'select pg_catalog.log(10, $1)' },
@@ -4330,11 +4330,11 @@
   proname => 'log10', prolang => 'sql', prorettype => 'numeric',
   proargtypes => 'numeric', prosrc => 'select pg_catalog.log(10, $1)' },
 { oid => '1742', descr => 'convert float4 to numeric',
-  proname => 'numeric', prorettype => 'numeric', proargtypes => 'float4',
-  prosrc => 'float4_numeric' },
+  proname => 'numeric', proleakproof => 't', prorettype => 'numeric',
+  proargtypes => 'float4', prosrc => 'float4_numeric' },
 { oid => '1743', descr => 'convert float8 to numeric',
-  proname => 'numeric', prorettype => 'numeric', proargtypes => 'float8',
-  prosrc => 'float8_numeric' },
+  proname => 'numeric', proleakproof => 't', prorettype => 'numeric',
+  proargtypes => 'float8', prosrc => 'float8_numeric' },
 { oid => '1744', descr => 'convert numeric to int4',
   proname => 'int4', prorettype => 'int4', proargtypes => 'numeric',
   prosrc => 'numeric_int4' },
@@ -4390,11 +4390,11 @@
   proname => 'int8', prorettype => 'int8', proargtypes => 'numeric',
   prosrc => 'numeric_int8' },
 { oid => '1781', descr => 'convert int8 to numeric',
-  proname => 'numeric', prorettype => 'numeric', proargtypes => 'int8',
-  prosrc => 'int8_numeric' },
+  proname => 'numeric', proleakproof => 't', prorettype => 'numeric',
+  proargtypes => 'int8', prosrc => 'int8_numeric' },
 { oid => '1782', descr => 'convert int2 to numeric',
-  proname => 'numeric', prorettype => 'numeric', proargtypes => 'int2',
-  prosrc => 'int2_numeric' },
+  proname => 'numeric', proleakproof => 't', prorettype => 'numeric',
+  proargtypes => 'int2', prosrc => 'int2_numeric' },
 { oid => '1783', descr => 'convert numeric to int2',
   proname => 'int2', prorettype => 'int2', proargtypes => 'numeric',
   prosrc => 'numeric_int2' },
@@ -7755,7 +7755,8 @@
 { oid => '2510', descr => 'get the prepared statements for this session',
   proname => 'pg_prepared_statement', prorows => '1000', proretset => 't',
   provolatile => 's', proparallel => 'r', prorettype => 'record',
-  proargtypes => '', proallargtypes => '{text,text,timestamptz,_regtype,bool,int8,int8}',
+  proargtypes => '',
+  proallargtypes => '{text,text,timestamptz,_regtype,bool,int8,int8}',
   proargmodes => '{o,o,o,o,o,o,o}',
   proargnames => '{name,statement,prepare_time,parameter_types,from_sql,generic_plans,custom_plans}',
   prosrc => 'pg_prepared_statement' },
@@ -7933,11 +7934,11 @@
   prosrc => 'pg_tablespace_databases' },
 
 { oid => '2557', descr => 'convert int4 to boolean',
-  proname => 'bool', prorettype => 'bool', proargtypes => 'int4',
-  prosrc => 'int4_bool' },
+  proname => 'bool', proleakproof => 't', prorettype => 'bool',
+  proargtypes => 'int4', prosrc => 'int4_bool' },
 { oid => '2558', descr => 'convert boolean to int4',
-  proname => 'int4', prorettype => 'int4', proargtypes => 'bool',
-  prosrc => 'bool_int4' },
+  proname => 'int4', proleakproof => 't', prorettype => 'int4',
+  proargtypes => 'bool', prosrc => 'bool_int4' },
 { oid => '2559', descr => 'current value from last used sequence',
   proname => 'lastval', provolatile => 'v', proparallel => 'u',
   prorettype => 'int8', proargtypes => '', prosrc => 'lastval' },
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 27056d70d36dd..1b3c146e4cc95 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -572,6 +572,8 @@ int24ge(smallint,integer)
 int42ge(integer,smallint)
 oideq(oid,oid)
 oidne(oid,oid)
+float8(smallint)
+float4(smallint)
 nameeqtext(name,text)
 namelttext(name,text)
 nameletext(name,text)
@@ -610,6 +612,10 @@ float84lt(double precision,real)
 float84le(double precision,real)
 float84gt(double precision,real)
 float84ge(double precision,real)
+float8(real)
+int4(smallint)
+float8(integer)
+float4(integer)
 btint2cmp(smallint,smallint)
 btint4cmp(integer,integer)
 btfloat4cmp(real,real)
@@ -620,6 +626,9 @@ btnamecmp(name,name)
 bttextcmp(text,text)
 cash_cmp(money,money)
 btoidvectorcmp(oidvector,oidvector)
+text(name)
+name(text)
+name(character)
 text_larger(text,text)
 text_smaller(text,text)
 int8eq(bigint,bigint)
@@ -634,7 +643,10 @@ int84lt(bigint,integer)
 int84gt(bigint,integer)
 int84le(bigint,integer)
 int84ge(bigint,integer)
+int8(integer)
+float8(bigint)
 oidvectorne(oidvector,oidvector)
+float4(bigint)
 namelt(name,name)
 namele(name,name)
 namegt(name,name)
@@ -651,6 +663,7 @@ text_lt(text,text)
 text_le(text,text)
 text_gt(text,text)
 text_ge(text,text)
+int8(smallint)
 macaddr_eq(macaddr,macaddr)
 macaddr_lt(macaddr,macaddr)
 macaddr_le(macaddr,macaddr)
@@ -716,6 +729,7 @@ interval_ge(interval,interval)
 interval_gt(interval,interval)
 charlt("char","char")
 tidne(tid,tid)
+int8(oid)
 tideq(tid,tid)
 timestamptz_cmp(timestamp with time zone,timestamp with time zone)
 interval_cmp(interval,interval)
@@ -727,6 +741,9 @@ timetz_le(time with time zone,time with time zone)
 timetz_ge(time with time zone,time with time zone)
 timetz_gt(time with time zone,time with time zone)
 timetz_cmp(time with time zone,time with time zone)
+"interval"(time without time zone)
+name(character varying)
+"varchar"(name)
 circle_eq(circle,circle)
 circle_ne(circle,circle)
 circle_lt(circle,circle)
@@ -757,6 +774,11 @@ varbitcmp(bit varying,bit varying)
 boolle(boolean,boolean)
 boolge(boolean,boolean)
 btboolcmp(boolean,boolean)
+"numeric"(integer)
+"numeric"(real)
+"numeric"(double precision)
+"numeric"(bigint)
+"numeric"(smallint)
 int28eq(smallint,bigint)
 int28ne(smallint,bigint)
 int28lt(smallint,bigint)
@@ -803,6 +825,8 @@ btfloat48cmp(real,double precision)
 btfloat84cmp(double precision,real)
 md5(text)
 md5(bytea)
+bool(integer)
+int4(boolean)
 tidgt(tid,tid)
 tidlt(tid,tid)
 tidge(tid,tid)
@@ -837,6 +861,7 @@ macaddr8_gt(macaddr8,macaddr8)
 macaddr8_ge(macaddr8,macaddr8)
 macaddr8_ne(macaddr8,macaddr8)
 macaddr8_cmp(macaddr8,macaddr8)
+macaddr8(macaddr)
 xid8lt(xid8,xid8)
 xid8gt(xid8,xid8)
 xid8le(xid8,xid8)

From 0a0727ccfc5f4e2926623abe877bdc0b5bfd682e Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 25 Jul 2020 16:34:35 -0400
Subject: [PATCH 12/54] Improve performance of binary COPY FROM through better
 buffering.

At least on Linux and macOS, fread() turns out to have far higher
per-call overhead than one could wish.  Reading 64KB of data at a time
and then parceling it out with our own memcpy logic makes binary COPY
from a file significantly faster --- around 30% in simple testing for
cases with narrow text columns (on Linux ... even more on macOS).

In binary COPY from frontend, there's no per-call fread(), and this
patch introduces an extra layer of memcpy'ing, but it still manages
to eke out a small win.  Apparently, the control-logic overhead in
CopyGetData() is enough to be worth avoiding for small fetches.

Bharath Rupireddy and Amit Langote, reviewed by Vignesh C,
cosmetic tweaks by me

Discussion: https://postgr.es/m/CALj2ACU5Bz06HWLwqSzNMN=Gupoj6Rcn_QVC+k070V4em9wu=A@mail.gmail.com
---
 src/backend/commands/copy.c | 118 +++++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 35 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 44da71c4cb5ca..db7d24a511e3b 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -187,15 +187,15 @@ typedef struct CopyStateData
 	TransitionCaptureState *transition_capture;
 
 	/*
-	 * These variables are used to reduce overhead in textual COPY FROM.
+	 * These variables are used to reduce overhead in COPY FROM.
 	 *
 	 * attribute_buf holds the separated, de-escaped text for each field of
 	 * the current line.  The CopyReadAttributes functions return arrays of
 	 * pointers into this buffer.  We avoid palloc/pfree overhead by re-using
 	 * the buffer on each cycle.
 	 *
-	 * (In binary COPY FROM, attribute_buf holds the binary data for the
-	 * current field, while the other variables are not used.)
+	 * In binary COPY FROM, attribute_buf holds the binary data for the
+	 * current field, but the usage is otherwise similar.
 	 */
 	StringInfoData attribute_buf;
 
@@ -209,7 +209,8 @@ typedef struct CopyStateData
 	 * input cycle is first to read the whole line into line_buf, convert it
 	 * to server encoding there, and then extract the individual attribute
 	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
-	 * can display it in error messages if appropriate.
+	 * can display it in error messages if appropriate.  (In binary mode,
+	 * line_buf is not used.)
 	 */
 	StringInfoData line_buf;
 	bool		line_buf_converted; /* converted to server encoding? */
@@ -217,15 +218,18 @@ typedef struct CopyStateData
 
 	/*
 	 * Finally, raw_buf holds raw data read from the data source (file or
-	 * client connection).  CopyReadLine parses this data sufficiently to
-	 * locate line boundaries, then transfers the data to line_buf and
-	 * converts it.  Note: we guarantee that there is a \0 at
-	 * raw_buf[raw_buf_len].
+	 * client connection).  In text mode, CopyReadLine parses this data
+	 * sufficiently to locate line boundaries, then transfers the data to
+	 * line_buf and converts it.  In binary mode, CopyReadBinaryData fetches
+	 * appropriate amounts of data from this buffer.  In both modes, we
+	 * guarantee that there is a \0 at raw_buf[raw_buf_len].
 	 */
 #define RAW_BUF_SIZE 65536		/* we palloc RAW_BUF_SIZE+1 bytes */
 	char	   *raw_buf;
 	int			raw_buf_index;	/* next byte to process */
 	int			raw_buf_len;	/* total # of bytes stored */
+	/* Shorthand for number of unconsumed bytes available in raw_buf */
+#define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
 } CopyStateData;
 
 /* DestReceiver for COPY (query) TO */
@@ -394,6 +398,8 @@ static void CopySendInt32(CopyState cstate, int32 val);
 static bool CopyGetInt32(CopyState cstate, int32 *val);
 static void CopySendInt16(CopyState cstate, int16 val);
 static bool CopyGetInt16(CopyState cstate, int16 *val);
+static bool CopyLoadRawBuf(CopyState cstate);
+static int	CopyReadBinaryData(CopyState cstate, char *dest, int nbytes);
 
 
 /*
@@ -723,7 +729,7 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread)
 /*
  * CopySendInt32 sends an int32 in network byte order
  */
-static void
+static inline void
 CopySendInt32(CopyState cstate, int32 val)
 {
 	uint32		buf;
@@ -737,12 +743,12 @@ CopySendInt32(CopyState cstate, int32 val)
  *
  * Returns true if OK, false if EOF
  */
-static bool
+static inline bool
 CopyGetInt32(CopyState cstate, int32 *val)
 {
 	uint32		buf;
 
-	if (CopyGetData(cstate, &buf, sizeof(buf), sizeof(buf)) != sizeof(buf))
+	if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
 	{
 		*val = 0;				/* suppress compiler warning */
 		return false;
@@ -754,7 +760,7 @@ CopyGetInt32(CopyState cstate, int32 *val)
 /*
  * CopySendInt16 sends an int16 in network byte order
  */
-static void
+static inline void
 CopySendInt16(CopyState cstate, int16 val)
 {
 	uint16		buf;
@@ -766,12 +772,12 @@ CopySendInt16(CopyState cstate, int16 val)
 /*
  * CopyGetInt16 reads an int16 that appears in network byte order
  */
-static bool
+static inline bool
 CopyGetInt16(CopyState cstate, int16 *val)
 {
 	uint16		buf;
 
-	if (CopyGetData(cstate, &buf, sizeof(buf), sizeof(buf)) != sizeof(buf))
+	if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
 	{
 		*val = 0;				/* suppress compiler warning */
 		return false;
@@ -786,26 +792,20 @@ CopyGetInt16(CopyState cstate, int16 *val)
  *
  * Returns true if able to obtain at least one more byte, else false.
  *
- * If raw_buf_index < raw_buf_len, the unprocessed bytes are transferred
- * down to the start of the buffer and then we load more data after that.
- * This case is used only when a frontend multibyte character crosses a
- * bufferload boundary.
+ * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
+ * of the buffer and then we load more data after that.  This case occurs only
+ * when a multibyte character crosses a bufferload boundary.
  */
 static bool
 CopyLoadRawBuf(CopyState cstate)
 {
-	int			nbytes;
+	int			nbytes = RAW_BUF_BYTES(cstate);
 	int			inbytes;
 
-	if (cstate->raw_buf_index < cstate->raw_buf_len)
-	{
-		/* Copy down the unprocessed data */
-		nbytes = cstate->raw_buf_len - cstate->raw_buf_index;
+	/* Copy down the unprocessed data if any. */
+	if (nbytes > 0)
 		memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
 				nbytes);
-	}
-	else
-		nbytes = 0;				/* no data need be saved */
 
 	inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
 						  1, RAW_BUF_SIZE - nbytes);
@@ -816,6 +816,54 @@ CopyLoadRawBuf(CopyState cstate)
 	return (inbytes > 0);
 }
 
+/*
+ * CopyReadBinaryData
+ *
+ * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
+ * and writes them to 'dest'.  Returns the number of bytes read (which
+ * would be less than 'nbytes' only if we reach EOF).
+ */
+static int
+CopyReadBinaryData(CopyState cstate, char *dest, int nbytes)
+{
+	int			copied_bytes = 0;
+
+	if (RAW_BUF_BYTES(cstate) >= nbytes)
+	{
+		/* Enough bytes are present in the buffer. */
+		memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
+		cstate->raw_buf_index += nbytes;
+		copied_bytes = nbytes;
+	}
+	else
+	{
+		/*
+		 * Not enough bytes in the buffer, so must read from the file.  Need
+		 * to loop since 'nbytes' could be larger than the buffer size.
+		 */
+		do
+		{
+			int			copy_bytes;
+
+			/* Load more data if buffer is empty. */
+			if (RAW_BUF_BYTES(cstate) == 0)
+			{
+				if (!CopyLoadRawBuf(cstate))
+					break;		/* EOF */
+			}
+
+			/* Transfer some bytes. */
+			copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
+			memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
+			cstate->raw_buf_index += copy_bytes;
+			dest += copy_bytes;
+			copied_bytes += copy_bytes;
+		} while (copied_bytes < nbytes);
+	}
+
+	return copied_bytes;
+}
+
 
 /*
  *	 DoCopy executes the SQL COPY statement
@@ -3366,17 +3414,17 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attval = NULL;
 
 	/*
-	 * Set up variables to avoid per-attribute overhead.  attribute_buf is
-	 * used in both text and binary modes, but we use line_buf and raw_buf
+	 * Set up variables to avoid per-attribute overhead.  attribute_buf and
+	 * raw_buf are used in both text and binary modes, but we use line_buf
 	 * only in text mode.
 	 */
 	initStringInfo(&cstate->attribute_buf);
+	cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
+	cstate->raw_buf_index = cstate->raw_buf_len = 0;
 	if (!cstate->binary)
 	{
 		initStringInfo(&cstate->line_buf);
 		cstate->line_buf_converted = false;
-		cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
-		cstate->raw_buf_index = cstate->raw_buf_len = 0;
 	}
 
 	/* Assign range table, we'll need it in CopyFrom. */
@@ -3527,7 +3575,7 @@ BeginCopyFrom(ParseState *pstate,
 		int32		tmp;
 
 		/* Signature */
-		if (CopyGetData(cstate, readSig, 11, 11) != 11 ||
+		if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
 			memcmp(readSig, BinarySignature, 11) != 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
@@ -3555,7 +3603,7 @@ BeginCopyFrom(ParseState *pstate,
 		/* Skip extension header, if present */
 		while (tmp-- > 0)
 		{
-			if (CopyGetData(cstate, readSig, 1, 1) != 1)
+			if (CopyReadBinaryData(cstate, readSig, 1) != 1)
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 						 errmsg("invalid COPY file header (wrong length)")));
@@ -3771,7 +3819,7 @@ NextCopyFrom(CopyState cstate, ExprContext *econtext,
 			char		dummy;
 
 			if (cstate->copy_dest != COPY_OLD_FE &&
-				CopyGetData(cstate, &dummy, 1, 1) > 0)
+				CopyReadBinaryData(cstate, &dummy, 1) > 0)
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 						 errmsg("received copy data after EOF marker")));
@@ -4744,8 +4792,8 @@ CopyReadBinaryAttribute(CopyState cstate, FmgrInfo *flinfo,
 	resetStringInfo(&cstate->attribute_buf);
 
 	enlargeStringInfo(&cstate->attribute_buf, fld_size);
-	if (CopyGetData(cstate, cstate->attribute_buf.data,
-					fld_size, fld_size) != fld_size)
+	if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
+						   fld_size) != fld_size)
 		ereport(ERROR,
 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 				 errmsg("unexpected EOF in COPY data")));

From ce4939ff70890fa658a4095b9fe457f8432b2575 Mon Sep 17 00:00:00 2001
From: Noah Misch <noah@leadboat.com>
Date: Sat, 25 Jul 2020 14:50:59 -0700
Subject: [PATCH 13/54] Use RAND_poll() for seeding randomness after fork().

OpenSSL deprecated RAND_cleanup(), and OpenSSL 1.1.0 made it into a
no-op.  Replace it with RAND_poll(), per an OpenSSL community
recommendation.  While this has no user-visible consequences under
OpenSSL defaults, it might help under non-default settings.

Daniel Gustafsson, reviewed by David Steele and Michael Paquier.

Discussion: https://postgr.es/m/9B038FA5-23E8-40D0-B932-D515E1D8F66A@yesql.se
---
 src/backend/postmaster/fork_process.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/backend/postmaster/fork_process.c b/src/backend/postmaster/fork_process.c
index def3cee37e2d7..15d6340800781 100644
--- a/src/backend/postmaster/fork_process.c
+++ b/src/backend/postmaster/fork_process.c
@@ -109,10 +109,12 @@ fork_process(void)
 		}
 
 		/*
-		 * Make sure processes do not share OpenSSL randomness state.
+		 * Make sure processes do not share OpenSSL randomness state. This is
+		 * no longer required in OpenSSL 1.1.1 and later versions, but until
+		 * we drop support for version < 1.1.1 we need to do this.
 		 */
 #ifdef USE_OPENSSL
-		RAND_cleanup();
+		RAND_poll();
 #endif
 	}
 

From 15e441972276e95639f8c3d9f5f66c2318fe9348 Mon Sep 17 00:00:00 2001
From: Noah Misch <noah@leadboat.com>
Date: Sat, 25 Jul 2020 14:50:59 -0700
Subject: [PATCH 14/54] Remove optimization for RAND_poll() failing.

The loop to generate seed data will exit on RAND_status(), so we don't
need to handle the case of RAND_poll() failing separately.  Failures
here are rare, so this a code cleanup, essentially.

Daniel Gustafsson, reviewed by David Steele and Michael Paquier.

Discussion: https://postgr.es/m/9B038FA5-23E8-40D0-B932-D515E1D8F66A@yesql.se
---
 src/port/pg_strong_random.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/port/pg_strong_random.c b/src/port/pg_strong_random.c
index eed8b87808948..14e8382cd8952 100644
--- a/src/port/pg_strong_random.c
+++ b/src/port/pg_strong_random.c
@@ -108,7 +108,11 @@ pg_strong_random(void *buf, size_t len)
 	/*
 	 * Check that OpenSSL's CSPRNG has been sufficiently seeded, and if not
 	 * add more seed data using RAND_poll().  With some older versions of
-	 * OpenSSL, it may be necessary to call RAND_poll() a number of times.
+	 * OpenSSL, it may be necessary to call RAND_poll() a number of times.  If
+	 * RAND_poll() fails to generate seed data within the given amount of
+	 * retries, subsequent RAND_bytes() calls will fail, but we allow that to
+	 * happen to let pg_strong_random() callers handle that with appropriate
+	 * error handling.
 	 */
 #define NUM_RAND_POLL_RETRIES 8
 
@@ -120,16 +124,7 @@ pg_strong_random(void *buf, size_t len)
 			break;
 		}
 
-		if (RAND_poll() == 0)
-		{
-			/*
-			 * RAND_poll() failed to generate any seed data, which means that
-			 * RAND_bytes() will probably fail.  For now, just fall through
-			 * and let that happen.  XXX: maybe we could seed it some other
-			 * way.
-			 */
-			break;
-		}
+		RAND_poll();
 	}
 
 	if (RAND_bytes(buf, len) == 1)

From 11a68e4b53ffccf336a2faf5fa380acda28e880b Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Sun, 26 Jul 2020 16:32:11 +0900
Subject: [PATCH 15/54] Tweak behavior of pg_stat_activity.leader_pid
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The initial implementation of leader_pid in pg_stat_activity added by
b025f32 took the approach to strictly print what a PGPROC entry
includes.  In short, if a backend has been involved in parallel query at
least once, leader_pid would remain set as long as the backend is alive.
For a parallel group leader, this means that the field would always be
set after it participated at least once in parallel query, and after
more discussions this could be confusing if using for example a
connection pooler.

This commit changes the data printed so as leader_pid becomes always
NULL for a parallel group leader, showing up a non-NULL value only for
the parallel workers, and actually as long as a parallel query is
running as workers are shut down once the query has completed.

This does not change the definition of any catalog, so no catalog bump
is needed.  Per discussion with Justin Pryzby, Álvaro Herrera, Julien
Rouhaud and me.

Discussion: https://postgr.es/m/20200721035145.GB17300@paquier.xyz
Backpatch-through: 13
---
 doc/src/sgml/monitoring.sgml        | 9 +++------
 src/backend/utils/adt/pgstatfuncs.c | 8 +++++++-
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index dc49177c78c13..7dcddf478a112 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -687,12 +687,9 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
        <structfield>leader_pid</structfield> <type>integer</type>
       </para>
       <para>
-       Process ID of the parallel group leader if this process is or
-       has been involved in parallel query, or null. This field is set
-       when a process wants to cooperate with parallel workers, and
-       remains set as long as the process exists. For a parallel group leader,
-       this field is set to its own process ID. For a parallel worker,
-       this field is set to the process ID of the parallel group leader.
+       Process ID of the parallel group leader, if this process is a
+       parallel query worker.  <literal>NULL</literal> if this process is a
+       parallel group leader or does not participate in parallel query.
       </para></entry>
      </row>
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 2aff739466ff1..95738a4e34eec 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -735,7 +735,13 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 				wait_event = pgstat_get_wait_event(raw_wait_event);
 
 				leader = proc->lockGroupLeader;
-				if (leader)
+
+				/*
+				 * Show the leader only for active parallel workers.  This
+				 * leaves the field as NULL for the leader of a parallel
+				 * group.
+				 */
+				if (leader && leader->pid != beentry->st_procpid)
 				{
 					values[29] = Int32GetDatum(leader->pid);
 					nulls[29] = false;

From 56788d2156fc32bd5737e7ac716d70e6a269b7bc Mon Sep 17 00:00:00 2001
From: David Rowley <drowley@postgresql.org>
Date: Sun, 26 Jul 2020 21:02:45 +1200
Subject: [PATCH 16/54] Allocate consecutive blocks during parallel seqscans

Previously we would allocate blocks to parallel workers during a parallel
sequential scan 1 block at a time.  Since other workers were likely to
request a block before a worker returns for another block number to work
on, this could lead to non-sequential I/O patterns in each worker which
could cause the operating system's readahead to perform poorly or not at
all.

Here we change things so that we allocate consecutive "chunks" of blocks
to workers and have them work on those until they're done, at which time
we allocate another chunk for the worker.  The size of these chunks is
based on the size of the relation.

Initial patch here was by Thomas Munro which showed some good improvements
just having a fixed chunk size of 64 blocks with a simple ramp-down near
the end of the scan. The revisions of the patch to make the chunk size
based on the relation size and the adjusted ramp-down in powers of two was
done by me, along with quite extensive benchmarking to determine the
optimal chunk sizes.

For the most part, benchmarks have shown significant performance
improvements for large parallel sequential scans on Linux, FreeBSD and
Windows using SSDs.  It's less clear how this affects the performance of
cloud providers.  Tests done so far are unable to obtain stable enough
performance to provide meaningful benchmark results.  It is possible that
this could cause some performance regressions on more obscure filesystems,
so we may need to later provide users with some ability to get something
closer to the old behavior.  For now, let's leave that until we see that
it's really required.

Author: Thomas Munro, David Rowley
Reviewed-by: Ranier Vilela, Soumyadeep Chakraborty, Robert Haas
Reviewed-by: Amit Kapila, Kirk Jamison
Discussion: https://postgr.es/m/CA+hUKGJ_EErDv41YycXcbMbCBkztA34+z1ts9VQH+ACRuvpxig@mail.gmail.com
---
 src/backend/access/heap/heapam.c   |  22 ++++--
 src/backend/access/table/tableam.c | 118 +++++++++++++++++++++++++++--
 src/include/access/relscan.h       |  14 +++-
 src/include/access/tableam.h       |   2 +
 4 files changed, 144 insertions(+), 12 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d881f4cd46a54..2c9bb0c7ee248 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -520,12 +520,14 @@ heapgettup(HeapScanDesc scan,
 			{
 				ParallelBlockTableScanDesc pbscan =
 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+				ParallelBlockTableScanWorker pbscanwork =
+				(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
-														 pbscan);
+														 pbscanwork, pbscan);
 
 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-														 pbscan);
+														 pbscanwork, pbscan);
 
 				/* Other processes might have already finished the scan. */
 				if (page == InvalidBlockNumber)
@@ -720,9 +722,11 @@ heapgettup(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 			finished = (page == InvalidBlockNumber);
 		}
 		else
@@ -834,12 +838,14 @@ heapgettup_pagemode(HeapScanDesc scan,
 			{
 				ParallelBlockTableScanDesc pbscan =
 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+				ParallelBlockTableScanWorker pbscanwork =
+				(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
-														 pbscan);
+														 pbscanwork, pbscan);
 
 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-														 pbscan);
+														 pbscanwork, pbscan);
 
 				/* Other processes might have already finished the scan. */
 				if (page == InvalidBlockNumber)
@@ -1019,9 +1025,11 @@ heapgettup_pagemode(HeapScanDesc scan,
 		{
 			ParallelBlockTableScanDesc pbscan =
 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
+			ParallelBlockTableScanWorker pbscanwork =
+			(ParallelBlockTableScanWorker) scan->rs_base.rs_private;
 
 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
-													 pbscan);
+													 pbscanwork, pbscan);
 			finished = (page == InvalidBlockNumber);
 		}
 		else
@@ -1155,6 +1163,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_nkeys = nkeys;
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
+	scan->rs_base.rs_private =
+		palloc(sizeof(ParallelBlockTableScanWorkerData));
 	scan->rs_strategy = NULL;	/* set in initscan */
 
 	/*
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 4b2bb29559a72..4e8553de2afc5 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -25,10 +25,24 @@
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "optimizer/plancat.h"
+#include "port/pg_bitutils.h"
 #include "storage/bufmgr.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
 
+/*
+ * Constants to control the behavior of block allocation to parallel workers
+ * during a parallel seqscan.  Technically these values do not need to be
+ * powers of 2, but having them as powers of 2 makes the math more optimal
+ * and makes the ramp-down stepping more even.
+ */
+
+/* The number of I/O chunks we try to break a parallel seqscan down into */
+#define PARALLEL_SEQSCAN_NCHUNKS			2048
+/* Ramp down size of allocations when we've only this number of chunks left */
+#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS	64
+/* Cap the size of parallel I/O chunks to this number of blocks */
+#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE		8192
 
 /* GUC variables */
 char	   *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
@@ -408,10 +422,37 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
  * to set the startblock once.
  */
 void
-table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_startblock_init(Relation rel,
+										 ParallelBlockTableScanWorker pbscanwork,
+										 ParallelBlockTableScanDesc pbscan)
 {
 	BlockNumber sync_startpage = InvalidBlockNumber;
 
+	/* Reset the state we use for controlling allocation size. */
+	memset(pbscanwork, 0, sizeof(*pbscanwork));
+
+	StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
+					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
+	/*
+	 * We determine the chunk size based on the size of the relation. First we
+	 * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
+	 * take the next highest power of 2 number of the chunk size.  This means
+	 * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+	 * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+	 */
+	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
+													   PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+	/*
+	 * Ensure we don't go over the maximum chunk size with larger tables. This
+	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+	 * tables.  Too large a chunk size has been shown to be detrimental to
+	 * synchronous scan performance.
+	 */
+	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
+
 retry:
 	/* Grab the spinlock. */
 	SpinLockAcquire(&pbscan->phs_mutex);
@@ -451,13 +492,40 @@ table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDes
  * backend gets an InvalidBlockNumber return.
  */
 BlockNumber
-table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)
+table_block_parallelscan_nextpage(Relation rel,
+								  ParallelBlockTableScanWorker pbscanwork,
+								  ParallelBlockTableScanDesc pbscan)
 {
 	BlockNumber page;
 	uint64		nallocated;
 
 	/*
-	 * phs_nallocated tracks how many pages have been allocated to workers
+	 * The logic below allocates block numbers out to parallel workers in a
+	 * way that each worker will receive a set of consecutive block numbers to
+	 * scan.  Earlier versions of this would allocate the next highest block
+	 * number to the next worker to call this function.  This would generally
+	 * result in workers never receiving consecutive block numbers.  Some
+	 * operating systems would not detect the sequential I/O pattern due to
+	 * each backend being a different process which could result in poor
+	 * performance due to inefficient or no readahead.  To work around this
+	 * issue, we now allocate a range of block numbers for each worker and
+	 * when they come back for another block, we give them the next one in
+	 * that range until the range is complete.  When the worker completes the
+	 * range of blocks we then allocate another range for it and return the
+	 * first block number from that range.
+	 *
+	 * Here we name these ranges of blocks "chunks".  The initial size of
+	 * these chunks is determined in table_block_parallelscan_startblock_init
+	 * based on the size of the relation.  Towards the end of the scan, we
+	 * start making reductions in the size of the chunks in order to attempt
+	 * to divide the remaining work over all the workers as evenly as
+	 * possible.
+	 *
+	 * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
+	 * the number of blocks remaining in the chunk.  When that reaches 0 then
+	 * we must allocate a new chunk for the worker.
+	 *
+	 * phs_nallocated tracks how many blocks have been allocated to workers
 	 * already.  When phs_nallocated >= rs_nblocks, all blocks have been
 	 * allocated.
 	 *
@@ -468,10 +536,50 @@ table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbsca
 	 * wide because of that, to avoid wrapping around when rs_nblocks is close
 	 * to 2^32.
 	 *
-	 * The actual page to return is calculated by adding the counter to the
+	 * The actual block to return is calculated by adding the counter to the
 	 * starting block number, modulo nblocks.
 	 */
-	nallocated = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1);
+
+	/*
+	 * First check if we have any remaining blocks in a previous chunk for
+	 * this worker.  We must consume all of the blocks from that before we
+	 * allocate a new chunk to the worker.
+	 */
+	if (pbscanwork->phsw_chunk_remaining > 0)
+	{
+		/*
+		 * Give them the next block in the range and update the remaining
+		 * number of blocks.
+		 */
+		nallocated = ++pbscanwork->phsw_nallocated;
+		pbscanwork->phsw_chunk_remaining--;
+	}
+	else
+	{
+		/*
+		 * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
+		 * remaining in the scan, we half the chunk size.  Since we reduce the
+		 * chunk size here, we'll hit this again after doing
+		 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
+		 * iterations of this, we'll end up doing the last few blocks with the
+		 * chunk size set to 1.
+		 */
+		if (pbscanwork->phsw_chunk_size > 1 &&
+			pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
+			(pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
+			pbscanwork->phsw_chunk_size >>= 1;
+
+		nallocated = pbscanwork->phsw_nallocated =
+			pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+									pbscanwork->phsw_chunk_size);
+
+		/*
+		 * Set the remaining number of blocks in this chunk so that subsequent
+		 * calls from this worker continue on with this chunk until it's done.
+		 */
+		pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
+	}
+
 	if (nallocated >= pbscan->phs_nblocks)
 		page = InvalidBlockNumber;	/* all blocks have been allocated */
 	else
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f0258831f742..56459769519b3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -42,9 +42,9 @@ typedef struct TableScanDescData
 	 */
 	uint32		rs_flags;
 
+	void	   *rs_private;		/* per-worker private memory for AM to use */
 	struct ParallelTableScanDescData *rs_parallel;	/* parallel scan
 													 * information */
-
 } TableScanDescData;
 typedef struct TableScanDescData *TableScanDesc;
 
@@ -81,6 +81,18 @@ typedef struct ParallelBlockTableScanDescData
 }			ParallelBlockTableScanDescData;
 typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc;
 
+/*
+ * Per backend state for parallel table scan, for block-oriented storage.
+ */
+typedef struct ParallelBlockTableScanWorkerData
+{
+	uint64		phsw_nallocated;	/* Current # of blocks into the scan */
+	uint32		phsw_chunk_remaining;	/* # blocks left in this chunk */
+	uint32		phsw_chunk_size;	/* The number of blocks to allocate in
+									 * each I/O chunk for the scan */
+}			ParallelBlockTableScanWorkerData;
+typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker;
+
 /*
  * Base class for fetches from a table via an index. This is the base-class
  * for such scans, which needs to be embedded in the respective struct for
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 0d28f01ca9183..7ba72c84e0217 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -1793,8 +1793,10 @@ extern Size table_block_parallelscan_initialize(Relation rel,
 extern void table_block_parallelscan_reinitialize(Relation rel,
 												  ParallelTableScanDesc pscan);
 extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
+													 ParallelBlockTableScanWorker pbscanwork,
 													 ParallelBlockTableScanDesc pbscan);
 extern void table_block_parallelscan_startblock_init(Relation rel,
+													 ParallelBlockTableScanWorker pbscanwork,
 													 ParallelBlockTableScanDesc pbscan);
 
 

From 200f6100a9f9fc71273aeb6aceac4430f3437195 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Sun, 26 Jul 2020 14:55:52 -0700
Subject: [PATCH 17/54] Fix LookupTupleHashEntryHash() pipeline-stall issue.

Refactor hash lookups in nodeAgg.c to improve performance.

Author: Andres Freund and Jeff Davis
Discussion: https://postgr.es/m/20200612213715.op4ye4q7gktqvpuo%40alap3.anarazel.de
Backpatch-through: 13
---
 src/backend/executor/execGrouping.c       |  29 ++--
 src/backend/executor/nodeAgg.c            | 163 +++++++++++-----------
 src/backend/executor/nodeRecursiveunion.c |   4 +-
 src/backend/executor/nodeSetOp.c          |   4 +-
 src/backend/executor/nodeSubplan.c        |   4 +-
 src/include/executor/executor.h           |   2 +-
 6 files changed, 105 insertions(+), 101 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 019b87df21ecb..321f427e478fc 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -22,11 +22,11 @@
 #include "utils/memutils.h"
 
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
-static uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
-										  const MinimalTuple tuple);
-static TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
-													TupleTableSlot *slot,
-													bool *isnew, uint32 hash);
+static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
+												 const MinimalTuple tuple);
+static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
+														   TupleTableSlot *slot,
+														   bool *isnew, uint32 hash);
 
 /*
  * Define parameters for tuple hash table code generation. The interface is
@@ -291,6 +291,9 @@ ResetTupleHashTable(TupleHashTable hashtable)
  * If isnew is NULL, we do not create new entries; we return NULL if no
  * match is found.
  *
+ * If hash is not NULL, we set it to the calculated hash value. This allows
+ * callers access to the hash value even if no entry is returned.
+ *
  * If isnew isn't NULL, then a new entry is created if no existing entry
  * matches.  On return, *isnew is true if the entry is newly created,
  * false if it existed already.  ->additional_data in the new entry has
@@ -298,11 +301,11 @@ ResetTupleHashTable(TupleHashTable hashtable)
  */
 TupleHashEntry
 LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
-					 bool *isnew)
+					 bool *isnew, uint32 *hash)
 {
 	TupleHashEntry entry;
 	MemoryContext oldContext;
-	uint32		hash;
+	uint32		local_hash;
 
 	/* Need to run the hash functions in short-lived context */
 	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
@@ -312,8 +315,13 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
-	hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
-	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
+	local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
+
+	if (hash != NULL)
+		*hash = local_hash;
+
+	Assert(entry == NULL || entry->hash == local_hash);
 
 	MemoryContextSwitchTo(oldContext);
 
@@ -362,6 +370,7 @@ LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
 	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
+	Assert(entry == NULL || entry->hash == hash);
 
 	MemoryContextSwitchTo(oldContext);
 
@@ -480,7 +489,7 @@ TupleHashTableHash_internal(struct tuplehash_hash *tb,
  * NB: This function may or may not change the memory context. Caller is
  * expected to change it back.
  */
-static TupleHashEntry
+static inline TupleHashEntry
 LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 							  bool *isnew, uint32 hash)
 {
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index b79c845a6b739..bbfc4af1ec9cb 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -391,7 +391,9 @@ static void finalize_partialaggregate(AggState *aggstate,
 									  AggStatePerAgg peragg,
 									  AggStatePerGroup pergroupstate,
 									  Datum *resultVal, bool *resultIsNull);
-static void prepare_hash_slot(AggState *aggstate);
+static inline void prepare_hash_slot(AggStatePerHash perhash,
+									 TupleTableSlot *inputslot,
+									 TupleTableSlot *hashslot);
 static void prepare_projection_slot(AggState *aggstate,
 									TupleTableSlot *slot,
 									int currentSet);
@@ -413,8 +415,9 @@ static int	hash_choose_num_partitions(uint64 input_groups,
 									   double hashentrysize,
 									   int used_bits,
 									   int *log2_npartittions);
-static AggStatePerGroup lookup_hash_entry(AggState *aggstate, uint32 hash,
-										  bool *in_hash_table);
+static void initialize_hash_entry(AggState *aggstate,
+								  TupleHashTable hashtable,
+								  TupleHashEntry entry);
 static void lookup_hash_entries(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
@@ -1207,12 +1210,11 @@ finalize_partialaggregate(AggState *aggstate,
  * Extract the attributes that make up the grouping key into the
  * hashslot. This is necessary to compute the hash or perform a lookup.
  */
-static void
-prepare_hash_slot(AggState *aggstate)
+static inline void
+prepare_hash_slot(AggStatePerHash perhash,
+				  TupleTableSlot *inputslot,
+				  TupleTableSlot *hashslot)
 {
-	TupleTableSlot *inputslot = aggstate->tmpcontext->ecxt_outertuple;
-	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
-	TupleTableSlot *hashslot = perhash->hashslot;
 	int			i;
 
 	/* transfer just the needed columns into hashslot */
@@ -2013,75 +2015,39 @@ hash_choose_num_partitions(uint64 input_groups, double hashentrysize,
 }
 
 /*
- * Find or create a hashtable entry for the tuple group containing the current
- * tuple (already set in tmpcontext's outertuple slot), in the current grouping
- * set (which the caller must have selected - note that initialize_aggregate
- * depends on this).
- *
- * When called, CurrentMemoryContext should be the per-query context. The
- * already-calculated hash value for the tuple must be specified.
- *
- * If in "spill mode", then only find existing hashtable entries; don't create
- * new ones. If a tuple's group is not already present in the hash table for
- * the current grouping set, assign *in_hash_table=false and the caller will
- * spill it to disk.
+ * Initialize a freshly-created TupleHashEntry.
  */
-static AggStatePerGroup
-lookup_hash_entry(AggState *aggstate, uint32 hash, bool *in_hash_table)
+static void
+initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
+					  TupleHashEntry entry)
 {
-	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
-	TupleTableSlot *hashslot = perhash->hashslot;
-	TupleHashEntryData *entry;
-	bool		isnew = false;
-	bool	   *p_isnew;
-
-	/* if hash table already spilled, don't create new entries */
-	p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
-
-	/* find or create the hashtable entry using the filtered tuple */
-	entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, p_isnew,
-									 hash);
-
-	if (entry == NULL)
-	{
-		*in_hash_table = false;
-		return NULL;
-	}
-	else
-		*in_hash_table = true;
-
-	if (isnew)
-	{
-		AggStatePerGroup pergroup;
-		int			transno;
+	AggStatePerGroup pergroup;
+	int			transno;
 
-		aggstate->hash_ngroups_current++;
-		hash_agg_check_limits(aggstate);
+	aggstate->hash_ngroups_current++;
+	hash_agg_check_limits(aggstate);
 
-		/* no need to allocate or initialize per-group state */
-		if (aggstate->numtrans == 0)
-			return NULL;
+	/* no need to allocate or initialize per-group state */
+	if (aggstate->numtrans == 0)
+		return;
 
-		pergroup = (AggStatePerGroup)
-			MemoryContextAlloc(perhash->hashtable->tablecxt,
-							   sizeof(AggStatePerGroupData) * aggstate->numtrans);
+	pergroup = (AggStatePerGroup)
+		MemoryContextAlloc(hashtable->tablecxt,
+						   sizeof(AggStatePerGroupData) * aggstate->numtrans);
 
-		entry->additional = pergroup;
+	entry->additional = pergroup;
 
-		/*
-		 * Initialize aggregates for new tuple group, lookup_hash_entries()
-		 * already has selected the relevant grouping set.
-		 */
-		for (transno = 0; transno < aggstate->numtrans; transno++)
-		{
-			AggStatePerTrans pertrans = &aggstate->pertrans[transno];
-			AggStatePerGroup pergroupstate = &pergroup[transno];
+	/*
+	 * Initialize aggregates for new tuple group, lookup_hash_entries()
+	 * already has selected the relevant grouping set.
+	 */
+	for (transno = 0; transno < aggstate->numtrans; transno++)
+	{
+		AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+		AggStatePerGroup pergroupstate = &pergroup[transno];
 
-			initialize_aggregate(aggstate, pertrans, pergroupstate);
-		}
+		initialize_aggregate(aggstate, pertrans, pergroupstate);
 	}
-
-	return entry->additional;
 }
 
 /*
@@ -2106,21 +2072,37 @@ static void
 lookup_hash_entries(AggState *aggstate)
 {
 	AggStatePerGroup *pergroup = aggstate->hash_pergroup;
+	TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
 	int			setno;
 
 	for (setno = 0; setno < aggstate->num_hashes; setno++)
 	{
 		AggStatePerHash perhash = &aggstate->perhash[setno];
+		TupleHashTable hashtable = perhash->hashtable;
+		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashEntry entry;
 		uint32		hash;
-		bool		in_hash_table;
+		bool		isnew = false;
+		bool	   *p_isnew;
+
+		/* if hash table already spilled, don't create new entries */
+		p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		select_current_set(aggstate, setno, true);
-		prepare_hash_slot(aggstate);
-		hash = TupleHashTableHash(perhash->hashtable, perhash->hashslot);
-		pergroup[setno] = lookup_hash_entry(aggstate, hash, &in_hash_table);
+		prepare_hash_slot(perhash,
+						  outerslot,
+						  hashslot);
+
+		entry = LookupTupleHashEntry(hashtable, hashslot,
+									 p_isnew, &hash);
 
-		/* check to see if we need to spill the tuple for this grouping set */
-		if (!in_hash_table)
+		if (entry != NULL)
+		{
+			if (isnew)
+				initialize_hash_entry(aggstate, hashtable, entry);
+			pergroup[setno] = entry->additional;
+		}
+		else
 		{
 			HashAggSpill *spill = &aggstate->hash_spills[setno];
 			TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
@@ -2131,6 +2113,7 @@ lookup_hash_entries(AggState *aggstate)
 								   aggstate->hashentrysize);
 
 			hashagg_spill_tuple(aggstate, spill, slot, hash);
+			pergroup[setno] = NULL;
 		}
 	}
 }
@@ -2588,6 +2571,7 @@ static bool
 agg_refill_hash_table(AggState *aggstate)
 {
 	HashAggBatch *batch;
+	AggStatePerHash perhash;
 	HashAggSpill spill;
 	HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
 	uint64		ngroups_estimate;
@@ -2639,6 +2623,8 @@ agg_refill_hash_table(AggState *aggstate)
 
 	select_current_set(aggstate, batch->setno, true);
 
+	perhash = &aggstate->perhash[aggstate->current_set];
+
 	/*
 	 * Spilled tuples are always read back as MinimalTuples, which may be
 	 * different from the outer plan, so recompile the aggregate expressions.
@@ -2652,10 +2638,13 @@ agg_refill_hash_table(AggState *aggstate)
 							 HASHAGG_READ_BUFFER_SIZE);
 	for (;;)
 	{
-		TupleTableSlot *slot = aggstate->hash_spill_rslot;
+		TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
+		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashEntry entry;
 		MinimalTuple tuple;
 		uint32		hash;
-		bool		in_hash_table;
+		bool		isnew = false;
+		bool	   *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -2663,16 +2652,20 @@ agg_refill_hash_table(AggState *aggstate)
 		if (tuple == NULL)
 			break;
 
-		ExecStoreMinimalTuple(tuple, slot, true);
-		aggstate->tmpcontext->ecxt_outertuple = slot;
+		ExecStoreMinimalTuple(tuple, spillslot, true);
+		aggstate->tmpcontext->ecxt_outertuple = spillslot;
 
-		prepare_hash_slot(aggstate);
-		aggstate->hash_pergroup[batch->setno] =
-			lookup_hash_entry(aggstate, hash, &in_hash_table);
+		prepare_hash_slot(perhash,
+						  aggstate->tmpcontext->ecxt_outertuple,
+						  hashslot);
+		entry = LookupTupleHashEntryHash(
+										 perhash->hashtable, hashslot, p_isnew, hash);
 
-		if (in_hash_table)
+		if (entry != NULL)
 		{
-			/* Advance the aggregates (or combine functions) */
+			if (isnew)
+				initialize_hash_entry(aggstate, perhash->hashtable, entry);
+			aggstate->hash_pergroup[batch->setno] = entry->additional;
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2688,7 +2681,9 @@ agg_refill_hash_table(AggState *aggstate)
 								   ngroups_estimate, aggstate->hashentrysize);
 			}
 			/* no memory for a new group, spill */
-			hashagg_spill_tuple(aggstate, &spill, slot, hash);
+			hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
+
+			aggstate->hash_pergroup[batch->setno] = NULL;
 		}
 
 		/*
diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c
index 620414a1edcf9..046242682f017 100644
--- a/src/backend/executor/nodeRecursiveunion.c
+++ b/src/backend/executor/nodeRecursiveunion.c
@@ -94,7 +94,7 @@ ExecRecursiveUnion(PlanState *pstate)
 			if (plan->numCols > 0)
 			{
 				/* Find or build hashtable entry for this tuple's group */
-				LookupTupleHashEntry(node->hashtable, slot, &isnew);
+				LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 				/* Must reset temp context after each hashtable lookup */
 				MemoryContextReset(node->tempContext);
 				/* Ignore tuple if already seen */
@@ -141,7 +141,7 @@ ExecRecursiveUnion(PlanState *pstate)
 		if (plan->numCols > 0)
 		{
 			/* Find or build hashtable entry for this tuple's group */
-			LookupTupleHashEntry(node->hashtable, slot, &isnew);
+			LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 			/* Must reset temp context after each hashtable lookup */
 			MemoryContextReset(node->tempContext);
 			/* Ignore tuple if already seen */
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index bfd148a41a241..8d4ccff19cc62 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -381,7 +381,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 			/* Find or build hashtable entry for this tuple's group */
 			entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
-										 &isnew);
+										 &isnew, NULL);
 
 			/* If new tuple group, initialize counts */
 			if (isnew)
@@ -402,7 +402,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 			/* For tuples not seen previously, do not make hashtable entry */
 			entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
-										 NULL);
+										 NULL, NULL);
 
 			/* Advance the counts if entry is already present */
 			if (entry)
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 298b7757f57cc..38c2fc0b50b66 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -595,12 +595,12 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
 		 */
 		if (slotNoNulls(slot))
 		{
-			(void) LookupTupleHashEntry(node->hashtable, slot, &isnew);
+			(void) LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 			node->havehashrows = true;
 		}
 		else if (node->hashnulls)
 		{
-			(void) LookupTupleHashEntry(node->hashnulls, slot, &isnew);
+			(void) LookupTupleHashEntry(node->hashnulls, slot, &isnew, NULL);
 			node->havenullrows = true;
 		}
 
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index c7deeac662f6a..415e117407c96 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -139,7 +139,7 @@ extern TupleHashTable BuildTupleHashTableExt(PlanState *parent,
 											 MemoryContext tempcxt, bool use_variable_hash_iv);
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   TupleTableSlot *slot,
-										   bool *isnew);
+										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,

From e971357961f2bf5bddebb3f68ba8b55954709486 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 27 Jul 2020 10:28:06 +0900
Subject: [PATCH 18/54] Fix handling of structure for bytea data type in ECPG

Some code paths dedicated to bytea used the structure for varchar.  This
did not lead to any actual bugs, as bytea and varchar have the same
definition, but it could become a trap if one of these definitions
changes for a new feature or a bug fix.

Issue introduced by 050710b.

Author: Shenhao Wang
Reviewed-by: Vignesh C, Michael Paquier
Discussion: https://postgr.es/m/07ac7dee1efc44f99d7f53a074420177@G08CNEXMBPEKD06.g08.fujitsu.local
Backpatch-through: 12
---
 src/interfaces/ecpg/ecpglib/data.c       | 4 ++--
 src/interfaces/ecpg/ecpglib/descriptor.c | 4 ++--
 src/interfaces/ecpg/ecpglib/execute.c    | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/interfaces/ecpg/ecpglib/data.c b/src/interfaces/ecpg/ecpglib/data.c
index d3db5094cfa65..6bc91ef7eb6d9 100644
--- a/src/interfaces/ecpg/ecpglib/data.c
+++ b/src/interfaces/ecpg/ecpglib/data.c
@@ -523,8 +523,8 @@ ecpg_get_data(const PGresult *results, int act_tuple, int act_field, int lineno,
 
 				case ECPGt_bytea:
 					{
-						struct ECPGgeneric_varchar *variable =
-						(struct ECPGgeneric_varchar *) (var + offset * act_tuple);
+						struct ECPGgeneric_bytea *variable =
+						(struct ECPGgeneric_bytea *) (var + offset * act_tuple);
 						long		dst_size,
 									src_size,
 									dec_size;
diff --git a/src/interfaces/ecpg/ecpglib/descriptor.c b/src/interfaces/ecpg/ecpglib/descriptor.c
index f71f539bef9ee..369c2f0867ac9 100644
--- a/src/interfaces/ecpg/ecpglib/descriptor.c
+++ b/src/interfaces/ecpg/ecpglib/descriptor.c
@@ -591,8 +591,8 @@ set_desc_attr(struct descriptor_item *desc_item, struct variable *var,
 
 	else
 	{
-		struct ECPGgeneric_varchar *variable =
-		(struct ECPGgeneric_varchar *) (var->value);
+		struct ECPGgeneric_bytea *variable =
+		(struct ECPGgeneric_bytea *) (var->value);
 
 		desc_item->is_binary = true;
 		desc_item->data_len = variable->len;
diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c
index 6961d7c75b4dc..9d61ae72506f5 100644
--- a/src/interfaces/ecpg/ecpglib/execute.c
+++ b/src/interfaces/ecpg/ecpglib/execute.c
@@ -822,8 +822,8 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari
 
 			case ECPGt_bytea:
 				{
-					struct ECPGgeneric_varchar *variable =
-					(struct ECPGgeneric_varchar *) (var->value);
+					struct ECPGgeneric_bytea *variable =
+					(struct ECPGgeneric_bytea *) (var->value);
 
 					if (!(mallocedval = (char *) ecpg_alloc(variable->len, lineno)))
 						return false;
@@ -1401,7 +1401,7 @@ ecpg_build_params(struct statement *stmt)
 
 			if (var->type == ECPGt_bytea)
 			{
-				binary_length = ((struct ECPGgeneric_varchar *) (var->value))->len;
+				binary_length = ((struct ECPGgeneric_bytea *) (var->value))->len;
 				binary_format = true;
 			}
 		}

From a3ab7a707d9eda4b2162273348cba52252c0f0c9 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 27 Jul 2020 15:58:32 +0900
Subject: [PATCH 19/54] Fix corner case with 16kB-long decompression in
 pgcrypto, take 2

A compressed stream may end with an empty packet.  In this case
decompression finishes before reading the empty packet and the
remaining stream packet causes a failure in reading the following
data.  This commit makes sure to consume such extra data, avoiding a
failure when decompression the data.  This corner case was reproducible
easily with a data length of 16kB, and existed since e94dd6a.  A cheap
regression test is added to cover this case based on a random,
incompressible string.

The first attempt of this patch has allowed to find an older failure
within the compression logic of pgcrypto, fixed by b9b6105.  This
involved SLES 15 with z390 where a custom flavor of libz gets used.
Bonus thanks to Mark Wong for providing access to the specific
environment.

Reported-by: Frank Gagnepain
Author: Kyotaro Horiguchi, Michael Paquier
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/16476-692ef7b84e5fb893@postgresql.org
Backpatch-through: 9.5
---
 contrib/pgcrypto/expected/pgp-compression.out | 30 +++++++++++++++++++
 contrib/pgcrypto/pgp-compress.c               | 21 +++++++++++++
 contrib/pgcrypto/sql/pgp-compression.sql      | 21 +++++++++++++
 3 files changed, 72 insertions(+)

diff --git a/contrib/pgcrypto/expected/pgp-compression.out b/contrib/pgcrypto/expected/pgp-compression.out
index 32b350b8fe05b..d4c57feba30b8 100644
--- a/contrib/pgcrypto/expected/pgp-compression.out
+++ b/contrib/pgcrypto/expected/pgp-compression.out
@@ -48,3 +48,33 @@ select pgp_sym_decrypt(
  Secret message
 (1 row)
 
+-- check corner case involving an input string of 16kB, as per bug #16476.
+SELECT setseed(0);
+ setseed 
+---------
+ 
+(1 row)
+
+WITH random_string AS
+(
+  -- This generates a random string of 16366 bytes.  This is chosen
+  -- as random so that it does not get compressed, and the decompression
+  -- would work on a string with the same length as the origin, making the
+  -- test behavior more predictible.  lpad() ensures that the generated
+  -- hexadecimal value is completed by extra zero characters if random()
+  -- has generated a value strictly lower than 16.
+  SELECT string_agg(decode(lpad(to_hex((random()*256)::int), 2, '0'), 'hex'), '') as bytes
+    FROM generate_series(0, 16365)
+)
+SELECT bytes =
+    pgp_sym_decrypt_bytea(
+      pgp_sym_encrypt_bytea(bytes, 'key',
+                            'compress-algo=1,compress-level=1'),
+                            'key', 'expect-compress-algo=1')
+    AS is_same
+  FROM random_string;
+ is_same 
+---------
+ t
+(1 row)
+
diff --git a/contrib/pgcrypto/pgp-compress.c b/contrib/pgcrypto/pgp-compress.c
index 4b1d2a1ff5f96..3636a662b0769 100644
--- a/contrib/pgcrypto/pgp-compress.c
+++ b/contrib/pgcrypto/pgp-compress.c
@@ -286,7 +286,28 @@ decompress_read(void *priv, PullFilter *src, int len,
 
 	dec->buf_data = dec->buf_len - dec->stream.avail_out;
 	if (res == Z_STREAM_END)
+	{
+		uint8	   *tmp;
+
+		/*
+		 * A stream must be terminated by a normal packet.  If the last stream
+		 * packet in the source stream is a full packet, a normal empty packet
+		 * must follow.  Since the underlying packet reader doesn't know that
+		 * the compressed stream has been ended, we need to to consume the
+		 * terminating packet here.  This read does not harm even if the
+		 * stream has already ended.
+		 */
+		res = pullf_read(src, 1, &tmp);
+
+		if (res < 0)
+			return res;
+		else if (res > 0)
+		{
+			px_debug("decompress_read: extra bytes after end of stream");
+			return PXE_PGP_CORRUPT_DATA;
+		}
 		dec->eof = 1;
+	}
 	goto restart;
 }
 
diff --git a/contrib/pgcrypto/sql/pgp-compression.sql b/contrib/pgcrypto/sql/pgp-compression.sql
index ca9ee1fc0088a..87c59c6cabc4e 100644
--- a/contrib/pgcrypto/sql/pgp-compression.sql
+++ b/contrib/pgcrypto/sql/pgp-compression.sql
@@ -28,3 +28,24 @@ select pgp_sym_decrypt(
 	pgp_sym_encrypt('Secret message', 'key',
 			'compress-algo=2, compress-level=0'),
 	'key', 'expect-compress-algo=0');
+
+-- check corner case involving an input string of 16kB, as per bug #16476.
+SELECT setseed(0);
+WITH random_string AS
+(
+  -- This generates a random string of 16366 bytes.  This is chosen
+  -- as random so that it does not get compressed, and the decompression
+  -- would work on a string with the same length as the origin, making the
+  -- test behavior more predictible.  lpad() ensures that the generated
+  -- hexadecimal value is completed by extra zero characters if random()
+  -- has generated a value strictly lower than 16.
+  SELECT string_agg(decode(lpad(to_hex((random()*256)::int), 2, '0'), 'hex'), '') as bytes
+    FROM generate_series(0, 16365)
+)
+SELECT bytes =
+    pgp_sym_decrypt_bytea(
+      pgp_sym_encrypt_bytea(bytes, 'key',
+                            'compress-algo=1,compress-level=1'),
+                            'key', 'expect-compress-algo=1')
+    AS is_same
+  FROM random_string;

From bcbf9446a2983b6452c19cc50050456be262f7c5 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 27 Jul 2020 17:53:19 -0700
Subject: [PATCH 20/54] Remove hashagg_avoid_disk_plan GUC.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note: This GUC was originally named enable_hashagg_disk when it appeared
in commit 1f39bce0, which added disk-based hash aggregation.  It was
subsequently renamed in commit 92c58fd9.

Author: Peter Geoghegan
Reviewed-By: Jeff Davis, Álvaro Herrera
Discussion: https://postgr.es/m/9d9d1e1252a52ea1bad84ea40dbebfd54e672a0f.camel%40j-davis.com
Backpatch: 13-, where disk-based hash aggregation was introduced.
---
 doc/src/sgml/config.sgml              |  17 ---
 src/backend/optimizer/path/costsize.c |   1 -
 src/backend/optimizer/plan/planner.c  | 162 +++++++++-----------------
 src/backend/utils/misc/guc.c          |  10 --
 src/include/optimizer/cost.h          |   1 -
 5 files changed, 55 insertions(+), 136 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6ce59078967ce..822bbf1f27269 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4840,23 +4840,6 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-hashagg-avoid-disk-plan" xreflabel="hashagg_avoid_disk_plan">
-      <term><varname>hashagg_avoid_disk_plan</varname> (<type>boolean</type>)
-      <indexterm>
-       <primary><varname>hashagg_avoid_disk_plan</varname> configuration parameter</primary>
-      </indexterm>
-      </term>
-      <listitem>
-       <para>
-        If set to <literal>on</literal>, causes the planner to avoid choosing
-        hashed aggregation plans that are expected to use the disk. If hashed
-        aggregation is chosen, it may still require the use of disk at
-        execution time, even if this parameter is enabled. The default is
-        <literal>off</literal>.
-       </para>
-      </listitem>
-     </varlistentry>
-
      </variablelist>
      </sect2>
      <sect2 id="runtime-config-query-constants">
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 945aa93374811..27ce4cc8069b2 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -130,7 +130,6 @@ bool		enable_tidscan = true;
 bool		enable_sort = true;
 bool		enable_incremental_sort = true;
 bool		enable_hashagg = true;
-bool		hashagg_avoid_disk_plan = true;
 bool		enable_nestloop = true;
 bool		enable_material = true;
 bool		enable_mergejoin = true;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b406d41e91891..1345e522dcf5e 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4850,11 +4850,10 @@ create_distinct_paths(PlannerInfo *root,
 	 * Consider hash-based implementations of DISTINCT, if possible.
 	 *
 	 * If we were not able to make any other types of path, we *must* hash or
-	 * die trying.  If we do have other choices, there are several things that
+	 * die trying.  If we do have other choices, there are two things that
 	 * should prevent selection of hashing: if the query uses DISTINCT ON
 	 * (because it won't really have the expected behavior if we hash), or if
-	 * enable_hashagg is off, or if it looks like the hashtable will exceed
-	 * work_mem.
+	 * enable_hashagg is off.
 	 *
 	 * Note: grouping_is_hashable() is much more expensive to check than the
 	 * other gating conditions, so we want to do it last.
@@ -4864,12 +4863,7 @@ create_distinct_paths(PlannerInfo *root,
 	else if (parse->hasDistinctOn || !enable_hashagg)
 		allow_hash = false;		/* policy-based decision not to hash */
 	else
-	{
-		Size		hashentrysize = hash_agg_entry_size(0, cheapest_input_path->pathtarget->width, 0);
-
-		allow_hash = !hashagg_avoid_disk_plan ||
-			(hashentrysize * numDistinctRows <= work_mem * 1024L);
-	}
+		allow_hash = true;		/* default */
 
 	if (allow_hash && grouping_is_hashable(parse->distinctClause))
 	{
@@ -6749,8 +6743,6 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 
 	if (can_hash)
 	{
-		double		hashaggtablesize;
-
 		if (parse->groupingSets)
 		{
 			/*
@@ -6762,63 +6754,41 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 		}
 		else
 		{
-			hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
-														  agg_costs,
-														  dNumGroups);
-
 			/*
-			 * Provided that the estimated size of the hashtable does not
-			 * exceed work_mem, we'll generate a HashAgg Path, although if we
-			 * were unable to sort above, then we'd better generate a Path, so
-			 * that we at least have one.
+			 * Generate a HashAgg Path.  We just need an Agg over the
+			 * cheapest-total input path, since input order won't matter.
 			 */
-			if (!hashagg_avoid_disk_plan ||
-				hashaggtablesize < work_mem * 1024L ||
-				grouped_rel->pathlist == NIL)
-			{
-				/*
-				 * We just need an Agg over the cheapest-total input path,
-				 * since input order won't matter.
-				 */
-				add_path(grouped_rel, (Path *)
-						 create_agg_path(root, grouped_rel,
-										 cheapest_path,
-										 grouped_rel->reltarget,
-										 AGG_HASHED,
-										 AGGSPLIT_SIMPLE,
-										 parse->groupClause,
-										 havingQual,
-										 agg_costs,
-										 dNumGroups));
-			}
+			add_path(grouped_rel, (Path *)
+					 create_agg_path(root, grouped_rel,
+									 cheapest_path,
+									 grouped_rel->reltarget,
+									 AGG_HASHED,
+									 AGGSPLIT_SIMPLE,
+									 parse->groupClause,
+									 havingQual,
+									 agg_costs,
+									 dNumGroups));
 		}
 
 		/*
 		 * Generate a Finalize HashAgg Path atop of the cheapest partially
-		 * grouped path, assuming there is one. Once again, we'll only do this
-		 * if it looks as though the hash table won't exceed work_mem.
+		 * grouped path, assuming there is one
 		 */
 		if (partially_grouped_rel && partially_grouped_rel->pathlist)
 		{
 			Path	   *path = partially_grouped_rel->cheapest_total_path;
 
-			hashaggtablesize = estimate_hashagg_tablesize(path,
-														  agg_final_costs,
-														  dNumGroups);
-
-			if (!hashagg_avoid_disk_plan ||
-				hashaggtablesize < work_mem * 1024L)
-				add_path(grouped_rel, (Path *)
-						 create_agg_path(root,
-										 grouped_rel,
-										 path,
-										 grouped_rel->reltarget,
-										 AGG_HASHED,
-										 AGGSPLIT_FINAL_DESERIAL,
-										 parse->groupClause,
-										 havingQual,
-										 agg_final_costs,
-										 dNumGroups));
+			add_path(grouped_rel, (Path *)
+					 create_agg_path(root,
+									 grouped_rel,
+									 path,
+									 grouped_rel->reltarget,
+									 AGG_HASHED,
+									 AGGSPLIT_FINAL_DESERIAL,
+									 parse->groupClause,
+									 havingQual,
+									 agg_final_costs,
+									 dNumGroups));
 		}
 	}
 
@@ -7171,65 +7141,43 @@ create_partial_grouping_paths(PlannerInfo *root,
 		}
 	}
 
+	/*
+	 * Add a partially-grouped HashAgg Path where possible
+	 */
 	if (can_hash && cheapest_total_path != NULL)
 	{
-		double		hashaggtablesize;
-
 		/* Checked above */
 		Assert(parse->hasAggs || parse->groupClause);
 
-		hashaggtablesize =
-			estimate_hashagg_tablesize(cheapest_total_path,
-									   agg_partial_costs,
-									   dNumPartialGroups);
-
-		/*
-		 * Tentatively produce a partial HashAgg Path, depending on if it
-		 * looks as if the hash table will fit in work_mem.
-		 */
-		if ((!hashagg_avoid_disk_plan || hashaggtablesize < work_mem * 1024L) &&
-			cheapest_total_path != NULL)
-		{
-			add_path(partially_grouped_rel, (Path *)
-					 create_agg_path(root,
-									 partially_grouped_rel,
-									 cheapest_total_path,
-									 partially_grouped_rel->reltarget,
-									 AGG_HASHED,
-									 AGGSPLIT_INITIAL_SERIAL,
-									 parse->groupClause,
-									 NIL,
-									 agg_partial_costs,
-									 dNumPartialGroups));
-		}
+		add_path(partially_grouped_rel, (Path *)
+				 create_agg_path(root,
+								 partially_grouped_rel,
+								 cheapest_total_path,
+								 partially_grouped_rel->reltarget,
+								 AGG_HASHED,
+								 AGGSPLIT_INITIAL_SERIAL,
+								 parse->groupClause,
+								 NIL,
+								 agg_partial_costs,
+								 dNumPartialGroups));
 	}
 
+	/*
+	 * Now add a partially-grouped HashAgg partial Path where possible
+	 */
 	if (can_hash && cheapest_partial_path != NULL)
 	{
-		double		hashaggtablesize;
-
-		hashaggtablesize =
-			estimate_hashagg_tablesize(cheapest_partial_path,
-									   agg_partial_costs,
-									   dNumPartialPartialGroups);
-
-		/* Do the same for partial paths. */
-		if ((!hashagg_avoid_disk_plan ||
-			 hashaggtablesize < work_mem * 1024L) &&
-			cheapest_partial_path != NULL)
-		{
-			add_partial_path(partially_grouped_rel, (Path *)
-							 create_agg_path(root,
-											 partially_grouped_rel,
-											 cheapest_partial_path,
-											 partially_grouped_rel->reltarget,
-											 AGG_HASHED,
-											 AGGSPLIT_INITIAL_SERIAL,
-											 parse->groupClause,
-											 NIL,
-											 agg_partial_costs,
-											 dNumPartialPartialGroups));
-		}
+		add_partial_path(partially_grouped_rel, (Path *)
+						 create_agg_path(root,
+										 partially_grouped_rel,
+										 cheapest_partial_path,
+										 partially_grouped_rel->reltarget,
+										 AGG_HASHED,
+										 AGGSPLIT_INITIAL_SERIAL,
+										 parse->groupClause,
+										 NIL,
+										 agg_partial_costs,
+										 dNumPartialPartialGroups));
 	}
 
 	/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6f603cbbe8c83..abfa95a2314bf 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1006,16 +1006,6 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
-	{
-		{"hashagg_avoid_disk_plan", PGC_USERSET, QUERY_TUNING_METHOD,
-			gettext_noop("Causes the planner to avoid hashed aggregation plans that are expected to use the disk."),
-			NULL,
-			GUC_EXPLAIN
-		},
-		&hashagg_avoid_disk_plan,
-		false,
-		NULL, NULL, NULL
-	},
 	{
 		{"enable_material", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of materialization."),
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 613db8eab6882..6141654e4783a 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -55,7 +55,6 @@ extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
 extern PGDLLIMPORT bool enable_incremental_sort;
 extern PGDLLIMPORT bool enable_hashagg;
-extern PGDLLIMPORT bool hashagg_avoid_disk_plan;
 extern PGDLLIMPORT bool enable_nestloop;
 extern PGDLLIMPORT bool enable_material;
 extern PGDLLIMPORT bool enable_mergejoin;

From 13838740f61fc455aa4196d257efc0b761daba1f Mon Sep 17 00:00:00 2001
From: Etsuro Fujita <efujita@postgresql.org>
Date: Tue, 28 Jul 2020 11:00:00 +0900
Subject: [PATCH 21/54] Fix some issues with step generation in partition
 pruning.

In the case of range partitioning, get_steps_using_prefix() assumes that
the passed-in prefix list contains at least one clause for each of the
partition keys earlier than one specified in the passed-in
step_lastkeyno, but the caller (ie, gen_prune_steps_from_opexps())
didn't take it into account, which led to a server crash or incorrect
results when the list contained no clauses for such partition keys, as
reported in bug #16500 and #16501 from Kobayashi Hisanori.  Update the
caller to call that function only when the list created there contains
at least one clause for each of the earlier partition keys in the case
of range partitioning.

While at it, fix some other issues:

* The list to pass to get_steps_using_prefix() is allowed to contain
  multiple clauses for the same partition key, as described in the
  comment for that function, but that function actually assumed that the
  list contained just a single clause for each of middle partition keys,
  which led to an assertion failure when the list contained multiple
  clauses for such partition keys.  Update that function to match the
  comment.
* In the case of hash partitioning, partition keys are allowed to be
  NULL, in which case the list to pass to get_steps_using_prefix()
  contains no clauses for NULL partition keys, but that function treats
  that case as like the case of range partitioning, which led to the
  assertion failure.  Update the assertion test to take into account
  NULL partition keys in the case of hash partitioning.
* Fix a typo in a comment in get_steps_using_prefix_recurse().
* gen_partprune_steps() failed to detect self-contradiction from
  strict-qual clauses and an IS NULL clause for the same partition key
  in some cases, producing incorrect partition-pruning steps, which led
  to incorrect results of partition pruning, but didn't cause any
  user-visible problems fortunately, as the self-contradiction is
  detected later in the query planning.  Update that function to detect
  the self-contradiction.

Per bug #16500 and #16501 from Kobayashi Hisanori.  Patch by me, initial
diagnosis for the reported issue and review by Dmitry Dolgov.
Back-patch to v11, where partition pruning was introduced.

Discussion: https://postgr.es/m/16500-d1613f2a78e1e090%40postgresql.org
Discussion: https://postgr.es/m/16501-5234a9a0394f6754%40postgresql.org
---
 src/backend/partitioning/partprune.c          | 187 ++++++++++++------
 src/test/regress/expected/partition_prune.out |  92 +++++++++
 src/test/regress/sql/partition_prune.sql      |  71 +++++++
 3 files changed, 294 insertions(+), 56 deletions(-)

diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c
index badd31a44c3c1..253c690649824 100644
--- a/src/backend/partitioning/partprune.c
+++ b/src/backend/partitioning/partprune.c
@@ -1058,8 +1058,12 @@ gen_partprune_steps_internal(GeneratePruningStepsContext *context,
 				case PARTCLAUSE_MATCH_NULLNESS:
 					if (!clause_is_not_null)
 					{
-						/* check for conflicting IS NOT NULL */
-						if (bms_is_member(i, notnullkeys))
+						/*
+						 * check for conflicting IS NOT NULL as well as
+						 * contradicting strict clauses
+						 */
+						if (bms_is_member(i, notnullkeys) ||
+							keyclauses[i] != NIL)
 						{
 							context->contradictory = true;
 							return NIL;
@@ -1308,34 +1312,18 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 			{
 				case PARTITION_STRATEGY_LIST:
 				case PARTITION_STRATEGY_RANGE:
-					{
-						PartClauseInfo *last = NULL;
-
-						/*
-						 * Add this clause to the list of clauses to be used
-						 * for pruning if this is the first such key for this
-						 * operator strategy or if it is consecutively next to
-						 * the last column for which a clause with this
-						 * operator strategy was matched.
-						 */
-						if (btree_clauses[pc->op_strategy] != NIL)
-							last = llast(btree_clauses[pc->op_strategy]);
-
-						if (last == NULL ||
-							i == last->keyno || i == last->keyno + 1)
-							btree_clauses[pc->op_strategy] =
-								lappend(btree_clauses[pc->op_strategy], pc);
+					btree_clauses[pc->op_strategy] =
+						lappend(btree_clauses[pc->op_strategy], pc);
 
-						/*
-						 * We can't consider subsequent partition keys if the
-						 * clause for the current key contains a non-inclusive
-						 * operator.
-						 */
-						if (pc->op_strategy == BTLessStrategyNumber ||
-							pc->op_strategy == BTGreaterStrategyNumber)
-							consider_next_key = false;
-						break;
-					}
+					/*
+					 * We can't consider subsequent partition keys if the
+					 * clause for the current key contains a non-inclusive
+					 * operator.
+					 */
+					if (pc->op_strategy == BTLessStrategyNumber ||
+						pc->op_strategy == BTGreaterStrategyNumber)
+						consider_next_key = false;
+					break;
 
 				case PARTITION_STRATEGY_HASH:
 					if (pc->op_strategy != HTEqualStrategyNumber)
@@ -1374,6 +1362,7 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 				List	   *eq_clauses = btree_clauses[BTEqualStrategyNumber];
 				List	   *le_clauses = btree_clauses[BTLessEqualStrategyNumber];
 				List	   *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber];
+				bool		pk_has_clauses[PARTITION_MAX_KEYS];
 				int			strat;
 
 				/*
@@ -1396,6 +1385,35 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 						ListCell   *lc1;
 						List	   *prefix = NIL;
 						List	   *pc_steps;
+						bool		prefix_valid = true;
+
+						/*
+						 * If this is a clause for the first partition key,
+						 * there are no preceding expressions; generate a
+						 * pruning step without a prefix.
+						 *
+						 * Note that we pass NULL for step_nullkeys, because
+						 * we don't search list/range partition bounds where
+						 * some keys are NULL.
+						 */
+						if (pc->keyno == 0)
+						{
+							Assert(pc->op_strategy == strat);
+							pc_steps = get_steps_using_prefix(context, strat,
+															  pc->op_is_ne,
+															  pc->expr,
+															  pc->cmpfn,
+															  0,
+															  NULL,
+															  NIL);
+							opsteps = list_concat(opsteps, pc_steps);
+							continue;
+						}
+
+						/* (Re-)initialize the pk_has_clauses array */
+						Assert(pc->keyno > 0);
+						for (i = 0; i < pc->keyno; i++)
+							pk_has_clauses[i] = false;
 
 						/*
 						 * Expressions from = clauses can always be in the
@@ -1408,7 +1426,10 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 							if (eqpc->keyno == pc->keyno)
 								break;
 							if (eqpc->keyno < pc->keyno)
+							{
 								prefix = lappend(prefix, eqpc);
+								pk_has_clauses[eqpc->keyno] = true;
+							}
 						}
 
 						/*
@@ -1426,7 +1447,10 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 								if (lepc->keyno == pc->keyno)
 									break;
 								if (lepc->keyno < pc->keyno)
+								{
 									prefix = lappend(prefix, lepc);
+									pk_has_clauses[lepc->keyno] = true;
+								}
 							}
 						}
 
@@ -1445,11 +1469,33 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 								if (gepc->keyno == pc->keyno)
 									break;
 								if (gepc->keyno < pc->keyno)
+								{
 									prefix = lappend(prefix, gepc);
+									pk_has_clauses[gepc->keyno] = true;
+								}
 							}
 						}
 
 						/*
+						 * Check whether every earlier partition key has at
+						 * least one clause.
+						 */
+						for (i = 0; i < pc->keyno; i++)
+						{
+							if (!pk_has_clauses[i])
+							{
+								prefix_valid = false;
+								break;
+							}
+						}
+
+						/*
+						 * If prefix_valid, generate PartitionPruneStepOps.
+						 * Otherwise, we would not find clauses for a valid
+						 * subset of the partition keys anymore for the
+						 * strategy; give up on generating partition pruning
+						 * steps further for the strategy.
+						 *
 						 * As mentioned above, if 'prefix' contains multiple
 						 * expressions for the same key, the following will
 						 * generate multiple steps, one for each combination
@@ -1459,15 +1505,20 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
 						 * we don't search list/range partition bounds where
 						 * some keys are NULL.
 						 */
-						Assert(pc->op_strategy == strat);
-						pc_steps = get_steps_using_prefix(context, strat,
-														  pc->op_is_ne,
-														  pc->expr,
-														  pc->cmpfn,
-														  pc->keyno,
-														  NULL,
-														  prefix);
-						opsteps = list_concat(opsteps, pc_steps);
+						if (prefix_valid)
+						{
+							Assert(pc->op_strategy == strat);
+							pc_steps = get_steps_using_prefix(context, strat,
+															  pc->op_is_ne,
+															  pc->expr,
+															  pc->cmpfn,
+															  pc->keyno,
+															  NULL,
+															  prefix);
+							opsteps = list_concat(opsteps, pc_steps);
+						}
+						else
+							break;
 					}
 				}
 				break;
@@ -2182,6 +2233,14 @@ match_clause_to_partition_key(GeneratePruningStepsContext *context,
  * 'prefix'.  Actually, since 'prefix' may contain multiple clauses for the
  * same partition key column, we must generate steps for various combinations
  * of the clauses of different keys.
+ *
+ * For list/range partitioning, callers must ensure that step_nullkeys is
+ * NULL, and that prefix contains at least one clause for each of the
+ * partition keys earlier than one specified in step_lastkeyno if it's
+ * greater than zero.  For hash partitioning, step_nullkeys is allowed to be
+ * non-NULL, but they must ensure that prefix contains at least one clause
+ * for each of the partition keys other than those specified in step_nullkeys
+ * and step_lastkeyno.
  */
 static List *
 get_steps_using_prefix(GeneratePruningStepsContext *context,
@@ -2193,6 +2252,9 @@ get_steps_using_prefix(GeneratePruningStepsContext *context,
 					   Bitmapset *step_nullkeys,
 					   List *prefix)
 {
+	Assert(step_nullkeys == NULL ||
+		   context->rel->part_scheme->strategy == PARTITION_STRATEGY_HASH);
+
 	/* Quick exit if there are no values to prefix with. */
 	if (list_length(prefix) == 0)
 	{
@@ -2261,7 +2323,7 @@ get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
 		ListCell   *next_start;
 
 		/*
-		 * For each clause with cur_keyno, adds its expr and cmpfn to
+		 * For each clause with cur_keyno, add its expr and cmpfn to
 		 * step_exprs and step_cmpfns, respectively, and recurse after setting
 		 * next_start to the ListCell of the first clause for the next
 		 * partition key.
@@ -2278,23 +2340,19 @@ get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
 		for_each_cell(lc, prefix, start)
 		{
 			List	   *moresteps;
+			List	   *step_exprs1,
+					   *step_cmpfns1;
 
 			pc = lfirst(lc);
 			if (pc->keyno == cur_keyno)
 			{
-				/* clean up before starting a new recursion cycle. */
-				if (cur_keyno == 0)
-				{
-					list_free(step_exprs);
-					list_free(step_cmpfns);
-					step_exprs = list_make1(pc->expr);
-					step_cmpfns = list_make1_oid(pc->cmpfn);
-				}
-				else
-				{
-					step_exprs = lappend(step_exprs, pc->expr);
-					step_cmpfns = lappend_oid(step_cmpfns, pc->cmpfn);
-				}
+				/* Leave the original step_exprs unmodified. */
+				step_exprs1 = list_copy(step_exprs);
+				step_exprs1 = lappend(step_exprs1, pc->expr);
+
+				/* Leave the original step_cmpfns unmodified. */
+				step_cmpfns1 = list_copy(step_cmpfns);
+				step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn);
 			}
 			else
 			{
@@ -2311,9 +2369,12 @@ get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
 													   step_nullkeys,
 													   prefix,
 													   next_start,
-													   step_exprs,
-													   step_cmpfns);
+													   step_exprs1,
+													   step_cmpfns1);
 			result = list_concat(result, moresteps);
+
+			list_free(step_exprs1);
+			list_free(step_cmpfns1);
 		}
 	}
 	else
@@ -2321,9 +2382,23 @@ get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
 		/*
 		 * End the current recursion cycle and start generating steps, one for
 		 * each clause with cur_keyno, which is all clauses from here onward
-		 * till the end of the list.
+		 * till the end of the list.  Note that for hash partitioning,
+		 * step_nullkeys is allowed to be non-empty, in which case step_exprs
+		 * would only contain expressions for the earlier partition keys that
+		 * are not specified in step_nullkeys.
+		 */
+		Assert(list_length(step_exprs) == cur_keyno ||
+			   !bms_is_empty(step_nullkeys));
+		/*
+		 * Note also that for hash partitioning, each partition key should
+		 * have either equality clauses or an IS NULL clause, so if a
+		 * partition key doesn't have an expression, it would be specified
+		 * in step_nullkeys.
 		 */
-		Assert(list_length(step_exprs) == cur_keyno);
+		Assert(context->rel->part_scheme->strategy
+			   != PARTITION_STRATEGY_HASH ||
+			   list_length(step_exprs) + 2 + bms_num_members(step_nullkeys) ==
+			   context->rel->part_scheme->partnatts);
 		for_each_cell(lc, prefix, start)
 		{
 			PartClauseInfo *pc = lfirst(lc);
diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out
index 4315e8e0a379b..687cf8c5f4154 100644
--- a/src/test/regress/expected/partition_prune.out
+++ b/src/test/regress/expected/partition_prune.out
@@ -3671,3 +3671,95 @@ explain (costs off) update listp1 set a = 1 where a = 2;
 reset constraint_exclusion;
 reset enable_partition_pruning;
 drop table listp;
+--
+-- Check that gen_prune_steps_from_opexps() works well for various cases of
+-- clauses for different partition keys
+--
+create table rp_prefix_test1 (a int, b varchar) partition by range(a, b);
+create table rp_prefix_test1_p1 partition of rp_prefix_test1 for values from (1, 'a') to (1, 'b');
+create table rp_prefix_test1_p2 partition of rp_prefix_test1 for values from (2, 'a') to (2, 'b');
+-- Don't call get_steps_using_prefix() with the last partition key b plus
+-- an empty prefix
+explain (costs off) select * from rp_prefix_test1 where a <= 1 and b = 'a';
+                    QUERY PLAN                    
+--------------------------------------------------
+ Seq Scan on rp_prefix_test1_p1 rp_prefix_test1
+   Filter: ((a <= 1) AND ((b)::text = 'a'::text))
+(2 rows)
+
+create table rp_prefix_test2 (a int, b int, c int) partition by range(a, b, c);
+create table rp_prefix_test2_p1 partition of rp_prefix_test2 for values from (1, 1, 0) to (1, 1, 10);
+create table rp_prefix_test2_p2 partition of rp_prefix_test2 for values from (2, 2, 0) to (2, 2, 10);
+-- Don't call get_steps_using_prefix() with the last partition key c plus
+-- an invalid prefix (ie, b = 1)
+explain (costs off) select * from rp_prefix_test2 where a <= 1 and b = 1 and c >= 0;
+                   QUERY PLAN                   
+------------------------------------------------
+ Seq Scan on rp_prefix_test2_p1 rp_prefix_test2
+   Filter: ((a <= 1) AND (c >= 0) AND (b = 1))
+(2 rows)
+
+create table rp_prefix_test3 (a int, b int, c int, d int) partition by range(a, b, c, d);
+create table rp_prefix_test3_p1 partition of rp_prefix_test3 for values from (1, 1, 1, 0) to (1, 1, 1, 10);
+create table rp_prefix_test3_p2 partition of rp_prefix_test3 for values from (2, 2, 2, 0) to (2, 2, 2, 10);
+-- Test that get_steps_using_prefix() handles a prefix that contains multiple
+-- clauses for the partition key b (ie, b >= 1 and b >= 2)
+explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b >= 2 and c >= 2 and d >= 0;
+                                QUERY PLAN                                
+--------------------------------------------------------------------------
+ Seq Scan on rp_prefix_test3_p2 rp_prefix_test3
+   Filter: ((a >= 1) AND (b >= 1) AND (b >= 2) AND (c >= 2) AND (d >= 0))
+(2 rows)
+
+create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops);
+create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0);
+create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1);
+-- Test that get_steps_using_prefix() handles non-NULL step_nullkeys
+explain (costs off) select * from hp_prefix_test where a = 1 and b is null and c = 1 and d = 1;
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Seq Scan on hp_prefix_test_p1 hp_prefix_test
+   Filter: ((b IS NULL) AND (a = 1) AND (c = 1) AND (d = 1))
+(2 rows)
+
+drop table rp_prefix_test1;
+drop table rp_prefix_test2;
+drop table rp_prefix_test3;
+drop table hp_prefix_test;
+--
+-- Check that gen_partprune_steps() detects self-contradiction from clauses
+-- regardless of the order of the clauses (Here we use a custom operator to
+-- prevent the equivclass.c machinery from reordering the clauses)
+--
+create operator === (
+   leftarg = int4,
+   rightarg = int4,
+   procedure = int4eq,
+   commutator = ===,
+   hashes
+);
+create operator class part_test_int4_ops2
+for type int4
+using hash as
+operator 1 ===,
+function 2 part_hashint4_noop(int4, int8);
+create table hp_contradict_test (a int, b int) partition by hash (a part_test_int4_ops2, b part_test_int4_ops2);
+create table hp_contradict_test_p1 partition of hp_contradict_test for values with (modulus 2, remainder 0);
+create table hp_contradict_test_p2 partition of hp_contradict_test for values with (modulus 2, remainder 1);
+explain (costs off) select * from hp_contradict_test where a is null and a === 1 and b === 1;
+        QUERY PLAN        
+--------------------------
+ Result
+   One-Time Filter: false
+(2 rows)
+
+explain (costs off) select * from hp_contradict_test where a === 1 and b === 1 and a is null;
+        QUERY PLAN        
+--------------------------
+ Result
+   One-Time Filter: false
+(2 rows)
+
+drop table hp_contradict_test;
+drop operator class part_test_int4_ops2 using hash;
+drop operator ===(int4, int4);
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql
index 6658455a7461f..93ef9dc1f3407 100644
--- a/src/test/regress/sql/partition_prune.sql
+++ b/src/test/regress/sql/partition_prune.sql
@@ -1050,3 +1050,74 @@ reset constraint_exclusion;
 reset enable_partition_pruning;
 
 drop table listp;
+
+--
+-- Check that gen_prune_steps_from_opexps() works well for various cases of
+-- clauses for different partition keys
+--
+
+create table rp_prefix_test1 (a int, b varchar) partition by range(a, b);
+create table rp_prefix_test1_p1 partition of rp_prefix_test1 for values from (1, 'a') to (1, 'b');
+create table rp_prefix_test1_p2 partition of rp_prefix_test1 for values from (2, 'a') to (2, 'b');
+
+-- Don't call get_steps_using_prefix() with the last partition key b plus
+-- an empty prefix
+explain (costs off) select * from rp_prefix_test1 where a <= 1 and b = 'a';
+
+create table rp_prefix_test2 (a int, b int, c int) partition by range(a, b, c);
+create table rp_prefix_test2_p1 partition of rp_prefix_test2 for values from (1, 1, 0) to (1, 1, 10);
+create table rp_prefix_test2_p2 partition of rp_prefix_test2 for values from (2, 2, 0) to (2, 2, 10);
+
+-- Don't call get_steps_using_prefix() with the last partition key c plus
+-- an invalid prefix (ie, b = 1)
+explain (costs off) select * from rp_prefix_test2 where a <= 1 and b = 1 and c >= 0;
+
+create table rp_prefix_test3 (a int, b int, c int, d int) partition by range(a, b, c, d);
+create table rp_prefix_test3_p1 partition of rp_prefix_test3 for values from (1, 1, 1, 0) to (1, 1, 1, 10);
+create table rp_prefix_test3_p2 partition of rp_prefix_test3 for values from (2, 2, 2, 0) to (2, 2, 2, 10);
+
+-- Test that get_steps_using_prefix() handles a prefix that contains multiple
+-- clauses for the partition key b (ie, b >= 1 and b >= 2)
+explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b >= 2 and c >= 2 and d >= 0;
+
+create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops);
+create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0);
+create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1);
+
+-- Test that get_steps_using_prefix() handles non-NULL step_nullkeys
+explain (costs off) select * from hp_prefix_test where a = 1 and b is null and c = 1 and d = 1;
+
+drop table rp_prefix_test1;
+drop table rp_prefix_test2;
+drop table rp_prefix_test3;
+drop table hp_prefix_test;
+
+--
+-- Check that gen_partprune_steps() detects self-contradiction from clauses
+-- regardless of the order of the clauses (Here we use a custom operator to
+-- prevent the equivclass.c machinery from reordering the clauses)
+--
+
+create operator === (
+   leftarg = int4,
+   rightarg = int4,
+   procedure = int4eq,
+   commutator = ===,
+   hashes
+);
+create operator class part_test_int4_ops2
+for type int4
+using hash as
+operator 1 ===,
+function 2 part_hashint4_noop(int4, int8);
+
+create table hp_contradict_test (a int, b int) partition by hash (a part_test_int4_ops2, b part_test_int4_ops2);
+create table hp_contradict_test_p1 partition of hp_contradict_test for values with (modulus 2, remainder 0);
+create table hp_contradict_test_p2 partition of hp_contradict_test for values with (modulus 2, remainder 1);
+
+explain (costs off) select * from hp_contradict_test where a is null and a === 1 and b === 1;
+explain (costs off) select * from hp_contradict_test where a === 1 and b === 1 and a is null;
+
+drop table hp_contradict_test;
+drop operator class part_test_int4_ops2 using hash;
+drop operator ===(int4, int4);

From 45fdc9738b36d1068d3ad8fdb06436d6fd14436b Mon Sep 17 00:00:00 2001
From: Amit Kapila <akapila@postgresql.org>
Date: Tue, 28 Jul 2020 08:06:44 +0530
Subject: [PATCH 22/54] Extend the logical decoding output plugin API with
 stream methods.

This adds seven methods to the output plugin API, adding support for
streaming changes of large in-progress transactions.

* stream_start
* stream_stop
* stream_abort
* stream_commit
* stream_change
* stream_message
* stream_truncate

Most of this is a simple extension of the existing methods, with
the semantic difference that the transaction (or subtransaction)
is incomplete and may be aborted later (which is something the
regular API does not really need to deal with).

This also extends the 'test_decoding' plugin, implementing these
new stream methods.

The stream_start/start_stop are used to demarcate a chunk of changes
streamed for a particular toplevel transaction.

This commit simply adds these new APIs and the upcoming patch to "allow
the streaming mode in ReorderBuffer" will use these APIs.

Author: Tomas Vondra, Dilip Kumar, Amit Kapila
Reviewed-by: Amit Kapila
Tested-by: Neha Sharma and Mahendra Singh Thalor
Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com
---
 contrib/test_decoding/test_decoding.c     | 176 +++++++++++
 doc/src/sgml/logicaldecoding.sgml         | 218 ++++++++++++++
 src/backend/replication/logical/logical.c | 351 ++++++++++++++++++++++
 src/include/replication/logical.h         |   5 +
 src/include/replication/output_plugin.h   |  69 +++++
 src/include/replication/reorderbuffer.h   |  59 ++++
 6 files changed, 878 insertions(+)

diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c
index 93c948856e7b3..dbef52a3af470 100644
--- a/contrib/test_decoding/test_decoding.c
+++ b/contrib/test_decoding/test_decoding.c
@@ -62,6 +62,28 @@ static void pg_decode_message(LogicalDecodingContext *ctx,
 							  ReorderBufferTXN *txn, XLogRecPtr message_lsn,
 							  bool transactional, const char *prefix,
 							  Size sz, const char *message);
+static void pg_decode_stream_start(LogicalDecodingContext *ctx,
+								   ReorderBufferTXN *txn);
+static void pg_decode_stream_stop(LogicalDecodingContext *ctx,
+								  ReorderBufferTXN *txn);
+static void pg_decode_stream_abort(LogicalDecodingContext *ctx,
+								   ReorderBufferTXN *txn,
+								   XLogRecPtr abort_lsn);
+static void pg_decode_stream_commit(LogicalDecodingContext *ctx,
+									ReorderBufferTXN *txn,
+									XLogRecPtr commit_lsn);
+static void pg_decode_stream_change(LogicalDecodingContext *ctx,
+									ReorderBufferTXN *txn,
+									Relation relation,
+									ReorderBufferChange *change);
+static void pg_decode_stream_message(LogicalDecodingContext *ctx,
+									 ReorderBufferTXN *txn, XLogRecPtr message_lsn,
+									 bool transactional, const char *prefix,
+									 Size sz, const char *message);
+static void pg_decode_stream_truncate(LogicalDecodingContext *ctx,
+									  ReorderBufferTXN *txn,
+									  int nrelations, Relation relations[],
+									  ReorderBufferChange *change);
 
 void
 _PG_init(void)
@@ -83,6 +105,13 @@ _PG_output_plugin_init(OutputPluginCallbacks *cb)
 	cb->filter_by_origin_cb = pg_decode_filter;
 	cb->shutdown_cb = pg_decode_shutdown;
 	cb->message_cb = pg_decode_message;
+	cb->stream_start_cb = pg_decode_stream_start;
+	cb->stream_stop_cb = pg_decode_stream_stop;
+	cb->stream_abort_cb = pg_decode_stream_abort;
+	cb->stream_commit_cb = pg_decode_stream_commit;
+	cb->stream_change_cb = pg_decode_stream_change;
+	cb->stream_message_cb = pg_decode_stream_message;
+	cb->stream_truncate_cb = pg_decode_stream_truncate;
 }
 
 
@@ -540,3 +569,150 @@ pg_decode_message(LogicalDecodingContext *ctx,
 	appendBinaryStringInfo(ctx->out, message, sz);
 	OutputPluginWrite(ctx, true);
 }
+
+/*
+ * We never try to stream any empty xact so we don't need any special handling
+ * for skip_empty_xacts in streaming mode APIs.
+ */
+static void
+pg_decode_stream_start(LogicalDecodingContext *ctx,
+					   ReorderBufferTXN *txn)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "opening a streamed block for transaction TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "opening a streamed block for transaction");
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * We never try to stream any empty xact so we don't need any special handling
+ * for skip_empty_xacts in streaming mode APIs.
+ */
+static void
+pg_decode_stream_stop(LogicalDecodingContext *ctx,
+					  ReorderBufferTXN *txn)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "closing a streamed block for transaction TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "closing a streamed block for transaction");
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * We never try to stream any empty xact so we don't need any special handling
+ * for skip_empty_xacts in streaming mode APIs.
+ */
+static void
+pg_decode_stream_abort(LogicalDecodingContext *ctx,
+					   ReorderBufferTXN *txn,
+					   XLogRecPtr abort_lsn)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "aborting streamed (sub)transaction TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "aborting streamed (sub)transaction");
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * We never try to stream any empty xact so we don't need any special handling
+ * for skip_empty_xacts in streaming mode APIs.
+ */
+static void
+pg_decode_stream_commit(LogicalDecodingContext *ctx,
+						ReorderBufferTXN *txn,
+						XLogRecPtr commit_lsn)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "committing streamed transaction TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "committing streamed transaction");
+
+	if (data->include_timestamp)
+		appendStringInfo(ctx->out, " (at %s)",
+						 timestamptz_to_str(txn->commit_time));
+
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * In streaming mode, we don't display the changes as the transaction can abort
+ * at a later point in time.  We don't want users to see the changes until the
+ * transaction is committed.
+ */
+static void
+pg_decode_stream_change(LogicalDecodingContext *ctx,
+						ReorderBufferTXN *txn,
+						Relation relation,
+						ReorderBufferChange *change)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "streaming change for TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "streaming change for transaction");
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * In streaming mode, we don't display the contents for transactional messages
+ * as the transaction can abort at a later point in time.  We don't want users to
+ * see the message contents until the transaction is committed.
+ */
+static void
+pg_decode_stream_message(LogicalDecodingContext *ctx,
+						 ReorderBufferTXN *txn, XLogRecPtr lsn, bool transactional,
+						 const char *prefix, Size sz, const char *message)
+{
+	OutputPluginPrepareWrite(ctx, true);
+
+	if (transactional)
+	{
+		appendStringInfo(ctx->out, "streaming message: transactional: %d prefix: %s, sz: %zu",
+						 transactional, prefix, sz);
+	}
+	else
+	{
+		appendStringInfo(ctx->out, "streaming message: transactional: %d prefix: %s, sz: %zu content:",
+						 transactional, prefix, sz);
+		appendBinaryStringInfo(ctx->out, message, sz);
+	}
+
+	OutputPluginWrite(ctx, true);
+}
+
+/*
+ * In streaming mode, we don't display the detailed information of Truncate.
+ * See pg_decode_stream_change.
+ */
+static void
+pg_decode_stream_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+						  int nrelations, Relation relations[],
+						  ReorderBufferChange *change)
+{
+	TestDecodingData *data = ctx->output_plugin_private;
+
+	OutputPluginPrepareWrite(ctx, true);
+	if (data->include_xids)
+		appendStringInfo(ctx->out, "streaming truncate for TXN %u", txn->xid);
+	else
+		appendStringInfo(ctx->out, "streaming truncate for transaction");
+	OutputPluginWrite(ctx, true);
+}
diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml
index c89f93cf6bb55..791a62b57c9b5 100644
--- a/doc/src/sgml/logicaldecoding.sgml
+++ b/doc/src/sgml/logicaldecoding.sgml
@@ -389,6 +389,13 @@ typedef struct OutputPluginCallbacks
     LogicalDecodeMessageCB message_cb;
     LogicalDecodeFilterByOriginCB filter_by_origin_cb;
     LogicalDecodeShutdownCB shutdown_cb;
+    LogicalDecodeStreamStartCB stream_start_cb;
+    LogicalDecodeStreamStopCB stream_stop_cb;
+    LogicalDecodeStreamAbortCB stream_abort_cb;
+    LogicalDecodeStreamCommitCB stream_commit_cb;
+    LogicalDecodeStreamChangeCB stream_change_cb;
+    LogicalDecodeStreamMessageCB stream_message_cb;
+    LogicalDecodeStreamTruncateCB stream_truncate_cb;
 } OutputPluginCallbacks;
 
 typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb);
@@ -401,6 +408,15 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb);
      If <function>truncate_cb</function> is not set but a
      <command>TRUNCATE</command> is to be decoded, the action will be ignored.
     </para>
+
+    <para>
+     An output plugin may also define functions to support streaming of large,
+     in-progress transactions. The <function>stream_start_cb</function>,
+     <function>stream_stop_cb</function>, <function>stream_abort_cb</function>,
+     <function>stream_commit_cb</function> and <function>stream_change_cb</function>
+     are required, while <function>stream_message_cb</function> and
+     <function>stream_truncate_cb</function> are optional.
+    </para>
    </sect2>
 
    <sect2 id="logicaldecoding-capabilities">
@@ -679,6 +695,117 @@ typedef void (*LogicalDecodeMessageCB) (struct LogicalDecodingContext *ctx,
      </para>
     </sect3>
 
+    <sect3 id="logicaldecoding-output-plugin-stream-start">
+     <title>Stream Start Callback</title>
+     <para>
+      The <function>stream_start_cb</function> callback is called when opening
+      a block of streamed changes from an in-progress transaction.
+<programlisting>
+typedef void (*LogicalDecodeStreamStartCB) (struct LogicalDecodingContext *ctx,
+                                            ReorderBufferTXN *txn);
+</programlisting>
+     </para>
+    </sect3>
+
+    <sect3 id="logicaldecoding-output-plugin-stream-stop">
+     <title>Stream Stop Callback</title>
+     <para>
+      The <function>stream_stop_cb</function> callback is called when closing
+      a block of streamed changes from an in-progress transaction.
+<programlisting>
+typedef void (*LogicalDecodeStreamStopCB) (struct LogicalDecodingContext *ctx,
+                                           ReorderBufferTXN *txn);
+</programlisting>
+     </para>
+    </sect3>
+
+    <sect3 id="logicaldecoding-output-plugin-stream-abort">
+     <title>Stream Abort Callback</title>
+     <para>
+      The <function>stream_abort_cb</function> callback is called to abort
+      a previously streamed transaction.
+<programlisting>
+typedef void (*LogicalDecodeStreamAbortCB) (struct LogicalDecodingContext *ctx,
+                                            ReorderBufferTXN *txn,
+                                            XLogRecPtr abort_lsn);
+</programlisting>
+     </para>
+    </sect3>
+
+    <sect3 id="logicaldecoding-output-plugin-stream-commit">
+     <title>Stream Commit Callback</title>
+     <para>
+      The <function>stream_commit_cb</function> callback is called to commit
+      a previously streamed transaction.
+<programlisting>
+typedef void (*LogicalDecodeStreamCommitCB) (struct LogicalDecodingContext *ctx,
+                                             ReorderBufferTXN *txn,
+                                             XLogRecPtr commit_lsn);
+</programlisting>
+     </para>
+    </sect3>
+    
+    <sect3 id="logicaldecoding-output-plugin-stream-change">
+     <title>Stream Change Callback</title>
+     <para>
+      The <function>stream_change_cb</function> callback is called when sending
+      a change in a block of streamed changes (demarcated by
+      <function>stream_start_cb</function> and <function>stream_stop_cb</function> calls).
+      The actual changes are not displayed as the transaction can abort at a later
+      point in time and we don't decode changes for aborted transactions.
+<programlisting>
+typedef void (*LogicalDecodeStreamChangeCB) (struct LogicalDecodingContext *ctx,
+                                             ReorderBufferTXN *txn,
+                                             Relation relation,
+                                             ReorderBufferChange *change);
+</programlisting>
+     </para>
+    </sect3>
+
+    <sect3 id="logicaldecoding-output-plugin-stream-message">
+     <title>Stream Message Callback</title>
+     <para>
+      The <function>stream_message_cb</function> callback is called when sending
+      a generic message in a block of streamed changes (demarcated by
+      <function>stream_start_cb</function> and <function>stream_stop_cb</function> calls).
+      The message contents for transactional messages are not displayed as the transaction
+      can abort at a later point in time and we don't decode changes for aborted
+      transactions.
+<programlisting>
+typedef void (*LogicalDecodeStreamMessageCB) (struct LogicalDecodingContext *ctx,
+                                              ReorderBufferTXN *txn,
+                                              XLogRecPtr message_lsn,
+                                              bool transactional,
+                                              const char *prefix,
+                                              Size message_size,
+                                              const char *message);
+</programlisting>
+     </para>
+    </sect3>
+
+    <sect3 id="logicaldecoding-output-plugin-stream-truncate">
+     <title>Stream Truncate Callback</title>
+     <para>
+      The <function>stream_truncate_cb</function> callback is called for a
+      <command>TRUNCATE</command> command in a block of streamed changes
+      (demarcated by <function>stream_start_cb</function> and
+      <function>stream_stop_cb</function> calls).
+<programlisting>
+typedef void (*LogicalDecodeStreamTruncateCB) (struct LogicalDecodingContext *ctx,
+                                               ReorderBufferTXN *txn,
+                                               int nrelations,
+                                               Relation relations[],
+                                               ReorderBufferChange *change);
+</programlisting>
+      The parameters are analogous to the <function>stream_change_cb</function>
+      callback.  However, because <command>TRUNCATE</command> actions on
+      tables connected by foreign keys need to be executed together, this
+      callback receives an array of relations instead of just a single one.
+      See the description of the <xref linkend="sql-truncate"/> statement for
+      details.
+     </para>
+    </sect3>
+
    </sect2>
 
    <sect2 id="logicaldecoding-output-plugin-output">
@@ -747,4 +874,95 @@ OutputPluginWrite(ctx, true);
      </para>
    </note>
   </sect1>
+
+  <sect1 id="logicaldecoding-streaming">
+   <title>Streaming of Large Transactions for Logical Decoding</title>
+
+   <para>
+    The basic output plugin callbacks (e.g. <function>begin_cb</function>,
+    <function>change_cb</function>, <function>commit_cb</function> and
+    <function>message_cb</function>) are only invoked when the transaction
+    actually commits. The changes are still decoded from the transaction
+    log, but are only passed to the output plugin at commit (and discarded
+    if the transaction aborts).
+   </para>
+
+   <para>
+    This means that while the decoding happens incrementally, and may spill
+    to disk to keep memory usage under control, all the decoded changes have
+    to be transmitted when the transaction finally commits (or more precisely,
+    when the commit is decoded from the transaction log). Depending on the
+    size of the transaction and network bandwidth, the transfer time may
+    significantly increase the apply lag.
+   </para>
+
+   <para>
+    To reduce the apply lag caused by large transactions, an output plugin
+    may provide additional callback to support incremental streaming of
+    in-progress transactions. There are multiple required streaming callbacks
+    (<function>stream_start_cb</function>, <function>stream_stop_cb</function>,
+    <function>stream_abort_cb</function>, <function>stream_commit_cb</function>
+    and <function>stream_change_cb</function>) and two optional callbacks
+    (<function>stream_message_cb</function>) and (<function>stream_truncate_cb</function>).
+   </para>
+
+   <para>
+    When streaming an in-progress transaction, the changes (and messages) are
+    streamed in blocks demarcated by <function>stream_start_cb</function>
+    and <function>stream_stop_cb</function> callbacks. Once all the decoded
+    changes are transmitted, the transaction is committed using the
+    <function>stream_commit_cb</function> callback (or possibly aborted using
+    the <function>stream_abort_cb</function> callback).
+   </para>
+
+   <para>
+    One example sequence of streaming callback calls for one transaction may
+    look like this:
+<programlisting>
+stream_start_cb(...);   &lt;-- start of first block of changes
+  stream_change_cb(...);
+  stream_change_cb(...);
+  stream_message_cb(...);
+  stream_change_cb(...);
+  ...
+  stream_change_cb(...);
+stream_stop_cb(...);    &lt;-- end of first block of changes
+
+stream_start_cb(...);   &lt;-- start of second block of changes
+  stream_change_cb(...);
+  stream_change_cb(...);
+  stream_change_cb(...);
+  ...
+  stream_message_cb(...);
+  stream_change_cb(...);
+stream_stop_cb(...);    &lt;-- end of second block of changes
+
+stream_commit_cb(...);  &lt;-- commit of the streamed transaction
+</programlisting>
+   </para>
+
+   <para>
+    The actual sequence of callback calls may be more complicated, of course.
+    There may be blocks for multiple streamed transactions, some of the
+    transactions may get aborted, etc.
+   </para>
+
+   <para>
+    Similar to spill-to-disk behavior, streaming is triggered when the total
+    amount of changes decoded from the WAL (for all in-progress transactions)
+    exceeds limit defined by <varname>logical_decoding_work_mem</varname> setting.
+    At that point the largest toplevel transaction (measured by amount of memory
+    currently used for decoded changes) is selected and streamed.  However, in
+    some cases we still have to spill to the disk even if streaming is enabled
+    because if we cross the memory limit but we still have not decoded the
+    complete tuple e.g. only decoded toast table insert but not the main table
+    insert.
+   </para>
+
+   <para>
+    Even when streaming large transactions, the changes are still applied in
+    commit order, preserving the same guarantees as the non-streaming mode.
+   </para>
+
+  </sect1>
  </chapter>
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 61902be3b0ed6..05d24b93da021 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -65,6 +65,23 @@ static void message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
 							   XLogRecPtr message_lsn, bool transactional,
 							   const char *prefix, Size message_size, const char *message);
 
+/* streaming callbacks */
+static void stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr first_lsn);
+static void stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+								   XLogRecPtr last_lsn);
+static void stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr abort_lsn);
+static void stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 XLogRecPtr commit_lsn);
+static void stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 Relation relation, ReorderBufferChange *change);
+static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									  XLogRecPtr message_lsn, bool transactional,
+									  const char *prefix, Size message_size, const char *message);
+static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									   int nrelations, Relation relations[], ReorderBufferChange *change);
+
 static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin);
 
 /*
@@ -189,6 +206,39 @@ StartupDecodingContext(List *output_plugin_options,
 	ctx->reorder->commit = commit_cb_wrapper;
 	ctx->reorder->message = message_cb_wrapper;
 
+	/*
+	 * To support streaming, we require start/stop/abort/commit/change
+	 * callbacks. The message and truncate callbacks are optional, similar to
+	 * regular output plugins. We however enable streaming when at least one
+	 * of the methods is enabled so that we can easily identify missing
+	 * methods.
+	 *
+	 * We decide it here, but only check it later in the wrappers.
+	 */
+	ctx->streaming = (ctx->callbacks.stream_start_cb != NULL) ||
+		(ctx->callbacks.stream_stop_cb != NULL) ||
+		(ctx->callbacks.stream_abort_cb != NULL) ||
+		(ctx->callbacks.stream_commit_cb != NULL) ||
+		(ctx->callbacks.stream_change_cb != NULL) ||
+		(ctx->callbacks.stream_message_cb != NULL) ||
+		(ctx->callbacks.stream_truncate_cb != NULL);
+
+	/*
+	 * streaming callbacks
+	 *
+	 * stream_message and stream_truncate callbacks are optional, so we do not
+	 * fail with ERROR when missing, but the wrappers simply do nothing. We
+	 * must set the ReorderBuffer callbacks to something, otherwise the calls
+	 * from there will crash (we don't want to move the checks there).
+	 */
+	ctx->reorder->stream_start = stream_start_cb_wrapper;
+	ctx->reorder->stream_stop = stream_stop_cb_wrapper;
+	ctx->reorder->stream_abort = stream_abort_cb_wrapper;
+	ctx->reorder->stream_commit = stream_commit_cb_wrapper;
+	ctx->reorder->stream_change = stream_change_cb_wrapper;
+	ctx->reorder->stream_message = stream_message_cb_wrapper;
+	ctx->reorder->stream_truncate = stream_truncate_cb_wrapper;
+
 	ctx->out = makeStringInfo();
 	ctx->prepare_write = prepare_write;
 	ctx->write = do_write;
@@ -866,6 +916,307 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
 	error_context_stack = errcallback.previous;
 }
 
+static void
+stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						XLogRecPtr first_lsn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_start";
+	state.report_location = first_lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+
+	/*
+	 * report this message's lsn so replies from clients can give an up2date
+	 * answer. This won't ever be enough (and shouldn't be!) to confirm
+	 * receipt of this transaction, but it might allow another transaction's
+	 * commit to be confirmed with one message.
+	 */
+	ctx->write_location = first_lsn;
+
+	/* in streaming mode, stream_start_cb is required */
+	if (ctx->callbacks.stream_start_cb == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical streaming requires a stream_start_cb callback")));
+
+	ctx->callbacks.stream_start_cb(ctx, txn);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+					   XLogRecPtr last_lsn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_stop";
+	state.report_location = last_lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+
+	/*
+	 * report this message's lsn so replies from clients can give an up2date
+	 * answer. This won't ever be enough (and shouldn't be!) to confirm
+	 * receipt of this transaction, but it might allow another transaction's
+	 * commit to be confirmed with one message.
+	 */
+	ctx->write_location = last_lsn;
+
+	/* in streaming mode, stream_stop_cb is required */
+	if (ctx->callbacks.stream_stop_cb == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical streaming requires a stream_stop_cb callback")));
+
+	ctx->callbacks.stream_stop_cb(ctx, txn);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						XLogRecPtr abort_lsn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_abort";
+	state.report_location = abort_lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+	ctx->write_location = abort_lsn;
+
+	/* in streaming mode, stream_abort_cb is required */
+	if (ctx->callbacks.stream_abort_cb == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical streaming requires a stream_abort_cb callback")));
+
+	ctx->callbacks.stream_abort_cb(ctx, txn, abort_lsn);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						 XLogRecPtr commit_lsn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_commit";
+	state.report_location = txn->final_lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+	ctx->write_location = txn->end_lsn;
+
+	/* in streaming mode, stream_abort_cb is required */
+	if (ctx->callbacks.stream_commit_cb == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical streaming requires a stream_commit_cb callback")));
+
+	ctx->callbacks.stream_commit_cb(ctx, txn, commit_lsn);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						 Relation relation, ReorderBufferChange *change)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_change";
+	state.report_location = change->lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+
+	/*
+	 * report this change's lsn so replies from clients can give an up2date
+	 * answer. This won't ever be enough (and shouldn't be!) to confirm
+	 * receipt of this transaction, but it might allow another transaction's
+	 * commit to be confirmed with one message.
+	 */
+	ctx->write_location = change->lsn;
+
+	/* in streaming mode, stream_change_cb is required */
+	if (ctx->callbacks.stream_change_cb == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical streaming requires a stream_change_cb callback")));
+
+	ctx->callbacks.stream_change_cb(ctx, txn, relation, change);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						  XLogRecPtr message_lsn, bool transactional,
+						  const char *prefix, Size message_size, const char *message)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* this callback is optional */
+	if (ctx->callbacks.stream_message_cb == NULL)
+		return;
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_message";
+	state.report_location = message_lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId;
+	ctx->write_location = message_lsn;
+
+	/* do the actual work: call callback */
+	ctx->callbacks.stream_message_cb(ctx, txn, message_lsn, transactional, prefix,
+									 message_size, message);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
+static void
+stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+						   int nrelations, Relation relations[],
+						   ReorderBufferChange *change)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+	LogicalErrorCallbackState state;
+	ErrorContextCallback errcallback;
+
+	Assert(!ctx->fast_forward);
+
+	/* We're only supposed to call this when streaming is supported. */
+	Assert(ctx->streaming);
+
+	/* this callback is optional */
+	if (!ctx->callbacks.stream_truncate_cb)
+		return;
+
+	/* Push callback + info on the error context stack */
+	state.ctx = ctx;
+	state.callback_name = "stream_truncate";
+	state.report_location = change->lsn;
+	errcallback.callback = output_plugin_error_callback;
+	errcallback.arg = (void *) &state;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* set output state */
+	ctx->accept_writes = true;
+	ctx->write_xid = txn->xid;
+
+	/*
+	 * report this change's lsn so replies from clients can give an up2date
+	 * answer. This won't ever be enough (and shouldn't be!) to confirm
+	 * receipt of this transaction, but it might allow another transaction's
+	 * commit to be confirmed with one message.
+	 */
+	ctx->write_location = change->lsn;
+
+	ctx->callbacks.stream_truncate_cb(ctx, txn, nrelations, relations, change);
+
+	/* Pop the error context stack */
+	error_context_stack = errcallback.previous;
+}
+
 /*
  * Set the required catalog xmin horizon for historic snapshots in the current
  * replication slot.
diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h
index c2f2475e5d3e5..deef31825d6e1 100644
--- a/src/include/replication/logical.h
+++ b/src/include/replication/logical.h
@@ -79,6 +79,11 @@ typedef struct LogicalDecodingContext
 	 */
 	void	   *output_writer_private;
 
+	/*
+	 * Does the output plugin support streaming, and is it enabled?
+	 */
+	bool		streaming;
+
 	/*
 	 * State for writing output.
 	 */
diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h
index 3dd9236c576d4..b78c796450a18 100644
--- a/src/include/replication/output_plugin.h
+++ b/src/include/replication/output_plugin.h
@@ -99,6 +99,67 @@ typedef bool (*LogicalDecodeFilterByOriginCB) (struct LogicalDecodingContext *ct
  */
 typedef void (*LogicalDecodeShutdownCB) (struct LogicalDecodingContext *ctx);
 
+/*
+ * Called when starting to stream a block of changes from in-progress
+ * transaction (may be called repeatedly, if it's streamed in multiple
+ * chunks).
+ */
+typedef void (*LogicalDecodeStreamStartCB) (struct LogicalDecodingContext *ctx,
+											ReorderBufferTXN *txn);
+
+/*
+ * Called when stopping to stream a block of changes from in-progress
+ * transaction to a remote node (may be called repeatedly, if it's streamed
+ * in multiple chunks).
+ */
+typedef void (*LogicalDecodeStreamStopCB) (struct LogicalDecodingContext *ctx,
+										   ReorderBufferTXN *txn);
+
+/*
+ * Called to discard changes streamed to remote node from in-progress
+ * transaction.
+ */
+typedef void (*LogicalDecodeStreamAbortCB) (struct LogicalDecodingContext *ctx,
+											ReorderBufferTXN *txn,
+											XLogRecPtr abort_lsn);
+
+/*
+ * Called to apply changes streamed to remote node from in-progress
+ * transaction.
+ */
+typedef void (*LogicalDecodeStreamCommitCB) (struct LogicalDecodingContext *ctx,
+											 ReorderBufferTXN *txn,
+											 XLogRecPtr commit_lsn);
+
+/*
+ * Callback for streaming individual changes from in-progress transactions.
+ */
+typedef void (*LogicalDecodeStreamChangeCB) (struct LogicalDecodingContext *ctx,
+											 ReorderBufferTXN *txn,
+											 Relation relation,
+											 ReorderBufferChange *change);
+
+/*
+ * Callback for streaming generic logical decoding messages from in-progress
+ * transactions.
+ */
+typedef void (*LogicalDecodeStreamMessageCB) (struct LogicalDecodingContext *ctx,
+											  ReorderBufferTXN *txn,
+											  XLogRecPtr message_lsn,
+											  bool transactional,
+											  const char *prefix,
+											  Size message_size,
+											  const char *message);
+
+/*
+ * Callback for streaming truncates from in-progress transactions.
+ */
+typedef void (*LogicalDecodeStreamTruncateCB) (struct LogicalDecodingContext *ctx,
+											   ReorderBufferTXN *txn,
+											   int nrelations,
+											   Relation relations[],
+											   ReorderBufferChange *change);
+
 /*
  * Output plugin callbacks
  */
@@ -112,6 +173,14 @@ typedef struct OutputPluginCallbacks
 	LogicalDecodeMessageCB message_cb;
 	LogicalDecodeFilterByOriginCB filter_by_origin_cb;
 	LogicalDecodeShutdownCB shutdown_cb;
+	/* streaming of changes */
+	LogicalDecodeStreamStartCB stream_start_cb;
+	LogicalDecodeStreamStopCB stream_stop_cb;
+	LogicalDecodeStreamAbortCB stream_abort_cb;
+	LogicalDecodeStreamCommitCB stream_commit_cb;
+	LogicalDecodeStreamChangeCB stream_change_cb;
+	LogicalDecodeStreamMessageCB stream_message_cb;
+	LogicalDecodeStreamTruncateCB stream_truncate_cb;
 } OutputPluginCallbacks;
 
 /* Functions in replication/logical/logical.c */
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 1055e99e2e140..42bc817648739 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -348,6 +348,54 @@ typedef void (*ReorderBufferMessageCB) (ReorderBuffer *rb,
 										const char *prefix, Size sz,
 										const char *message);
 
+/* start streaming transaction callback signature */
+typedef void (*ReorderBufferStreamStartCB) (
+											ReorderBuffer *rb,
+											ReorderBufferTXN *txn,
+											XLogRecPtr first_lsn);
+
+/* stop streaming transaction callback signature */
+typedef void (*ReorderBufferStreamStopCB) (
+										   ReorderBuffer *rb,
+										   ReorderBufferTXN *txn,
+										   XLogRecPtr last_lsn);
+
+/* discard streamed transaction callback signature */
+typedef void (*ReorderBufferStreamAbortCB) (
+											ReorderBuffer *rb,
+											ReorderBufferTXN *txn,
+											XLogRecPtr abort_lsn);
+
+/* commit streamed transaction callback signature */
+typedef void (*ReorderBufferStreamCommitCB) (
+											 ReorderBuffer *rb,
+											 ReorderBufferTXN *txn,
+											 XLogRecPtr commit_lsn);
+
+/* stream change callback signature */
+typedef void (*ReorderBufferStreamChangeCB) (
+											 ReorderBuffer *rb,
+											 ReorderBufferTXN *txn,
+											 Relation relation,
+											 ReorderBufferChange *change);
+
+/* stream message callback signature */
+typedef void (*ReorderBufferStreamMessageCB) (
+											  ReorderBuffer *rb,
+											  ReorderBufferTXN *txn,
+											  XLogRecPtr message_lsn,
+											  bool transactional,
+											  const char *prefix, Size sz,
+											  const char *message);
+
+/* stream truncate callback signature */
+typedef void (*ReorderBufferStreamTruncateCB) (
+											   ReorderBuffer *rb,
+											   ReorderBufferTXN *txn,
+											   int nrelations,
+											   Relation relations[],
+											   ReorderBufferChange *change);
+
 struct ReorderBuffer
 {
 	/*
@@ -386,6 +434,17 @@ struct ReorderBuffer
 	ReorderBufferCommitCB commit;
 	ReorderBufferMessageCB message;
 
+	/*
+	 * Callbacks to be called when streaming a transaction.
+	 */
+	ReorderBufferStreamStartCB stream_start;
+	ReorderBufferStreamStopCB stream_stop;
+	ReorderBufferStreamAbortCB stream_abort;
+	ReorderBufferStreamCommitCB stream_commit;
+	ReorderBufferStreamChangeCB stream_change;
+	ReorderBufferStreamMessageCB stream_message;
+	ReorderBufferStreamTruncateCB stream_truncate;
+
 	/*
 	 * Pointer that will be passed untouched to the callbacks.
 	 */

From d7c8576ebe3949a644c700a9f54d88e7e373a647 Mon Sep 17 00:00:00 2001
From: David Rowley <drowley@postgresql.org>
Date: Tue, 28 Jul 2020 22:52:03 +1200
Subject: [PATCH 23/54] Doc: Improve documentation for pg_jit_available()

Per complaint from Scott Ribe. Based on wording suggestion from Tom Lane.

Discussion: https://postgr.es/m/1956E806-1468-4417-9A9D-235AE1D5FE1A@elevated-dev.com
Backpatch-through: 11, where pg_jit_available() was added
---
 doc/src/sgml/func.sgml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 959f6a1c2f255..f766c1bc67c14 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -21087,10 +21087,10 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
         <returnvalue>boolean</returnvalue>
        </para>
        <para>
-        Returns true if <acronym>JIT</acronym> compilation is available in
-        this session (see <xref linkend="jit"/>).
-        Returns false if <xref linkend="guc-jit"/> is set to false, or if the
-        feature was not enabled at compile time.
+        Returns true if a <acronym>JIT</acronym> compiler extension is
+        available (see <xref linkend="jit"/>) and the
+        <xref linkend="guc-jit"/> configuration parameter is set to
+        <literal>on</literal>.
        </para></entry>
       </row>
 

From 0e3e1c4e1cea68073132fe817fb3a98cb5c1b805 Mon Sep 17 00:00:00 2001
From: David Rowley <drowley@postgresql.org>
Date: Wed, 29 Jul 2020 11:42:21 +1200
Subject: [PATCH 24/54] Make EXPLAIN ANALYZE of HashAgg more similar to Hash
 Join

There were various unnecessary differences between Hash Agg's EXPLAIN
ANALYZE output and Hash Join's.  Here we modify the Hash Agg output so
that it's better aligned to Hash Join's.

The following changes have been made:
1. Start batches counter at 1 instead of 0.
2. Always display the "Batches" property, even when we didn't spill to
   disk.
3. Use the text "Batches" instead of "HashAgg Batches" for text format.
4. Use the text "Memory Usage" instead of "Peak Memory Usage" for text
   format.
5. Include "Batches" before "Memory Usage" in both text and non-text
   formats.

In passing also modify the "Planned Partitions" property so that we show
it regardless of if the value is 0 or not for non-text EXPLAIN formats.
This was pointed out by Justin Pryzby and probably should have been part
of 40efbf870.

Reviewed-by: Justin Pryzby, Jeff Davis
Discussion: https://postgr.es/m/CAApHDvrshRnA6C0VFnu7Fb9TVvgGo80PUMm5+2DiaS1gEkPvtw@mail.gmail.com
Backpatch-through: 13, where HashAgg batching was introduced
---
 src/backend/commands/explain.c | 35 +++++++++++++++++-----------------
 src/backend/executor/nodeAgg.c |  3 +++
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index a283e4d45c84b..54e3797a15b62 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3059,21 +3059,19 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 
-		if (es->costs && aggstate->hash_planned_partitions > 0)
-		{
+		if (es->costs)
 			ExplainPropertyInteger("Planned Partitions", NULL,
 								   aggstate->hash_planned_partitions, es);
-		}
 
 		if (!es->analyze)
 			return;
 
 		/* EXPLAIN ANALYZE */
+		ExplainPropertyInteger("HashAgg Batches", NULL,
+							   aggstate->hash_batches_used, es);
 		ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es);
 		ExplainPropertyInteger("Disk Usage", "kB",
 							   aggstate->hash_disk_used, es);
-		ExplainPropertyInteger("HashAgg Batches", NULL,
-							   aggstate->hash_batches_used, es);
 	}
 	else
 	{
@@ -3099,13 +3097,13 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 		else
 			appendStringInfoString(es->str, "  ");
 
-		appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT "kB",
-						 memPeakKb);
+		appendStringInfo(es->str, "Batches: %d  Memory Usage: " INT64_FORMAT "kB",
+						 aggstate->hash_batches_used, memPeakKb);
 
-		if (aggstate->hash_batches_used > 0)
-			appendStringInfo(es->str, "  Disk Usage: " UINT64_FORMAT "kB  HashAgg Batches: %d",
-							 aggstate->hash_disk_used,
-							 aggstate->hash_batches_used);
+		/* Only display disk usage if we spilled to disk */
+		if (aggstate->hash_batches_used > 1)
+			appendStringInfo(es->str, "  Disk Usage: " UINT64_FORMAT "kB",
+							 aggstate->hash_disk_used);
 		appendStringInfoChar(es->str, '\n');
 	}
 
@@ -3130,21 +3128,22 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 			{
 				ExplainIndentText(es);
 
-				appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT "kB",
-								 memPeakKb);
+				appendStringInfo(es->str, "Batches: %d  Memory Usage: " INT64_FORMAT "kB",
+								 hash_batches_used, memPeakKb);
 
-				if (hash_batches_used > 0)
-					appendStringInfo(es->str, "  Disk Usage: " UINT64_FORMAT "kB  HashAgg Batches: %d",
-									 hash_disk_used, hash_batches_used);
+				/* Only display disk usage if we spilled to disk */
+				if (hash_batches_used > 1)
+					appendStringInfo(es->str, "  Disk Usage: " UINT64_FORMAT "kB",
+									 hash_disk_used);
 				appendStringInfoChar(es->str, '\n');
 			}
 			else
 			{
+				ExplainPropertyInteger("HashAgg Batches", NULL,
+									   hash_batches_used, es);
 				ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb,
 									   es);
 				ExplainPropertyInteger("Disk Usage", "kB", hash_disk_used, es);
-				ExplainPropertyInteger("HashAgg Batches", NULL,
-									   hash_batches_used, es);
 			}
 
 			if (es->workers_state)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index bbfc4af1ec9cb..7d7bfa945605b 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -3641,6 +3641,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 		find_hash_columns(aggstate);
 		build_hash_tables(aggstate);
 		aggstate->table_filled = false;
+
+		/* Initialize this to 1, meaning nothing spilled, yet */
+		aggstate->hash_batches_used = 1;
 	}
 
 	/*

From f36e82072c8866ba2eca08d88d1a5c3e0c3d1eb4 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Jul 2020 16:59:01 -0700
Subject: [PATCH 25/54] Doc: Remove obsolete CREATE AGGREGATE note.

The planner is in fact willing to use hash aggregation when work_mem is
not set high enough for everything to fit in memory.  This has been the
case since commit 1f39bce0, which added disk-based hash aggregation.

There are a few remaining cases in which hash aggregation is avoided as
a matter of policy when the planner surmises that spilling will be
necessary.  For example, callers of choose_hashed_setop() still
conservatively avoid hash aggregation when spilling is anticipated.
That doesn't seem like a good enough reason to mention hash aggregation
in this context.

Backpatch: 13-, where disk-based hash aggregation was introduced.
---
 doc/src/sgml/ref/create_aggregate.sgml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/doc/src/sgml/ref/create_aggregate.sgml b/doc/src/sgml/ref/create_aggregate.sgml
index 811e288ec1efb..a315fff8bd3f6 100644
--- a/doc/src/sgml/ref/create_aggregate.sgml
+++ b/doc/src/sgml/ref/create_aggregate.sgml
@@ -386,10 +386,7 @@ SELECT col FROM tab ORDER BY col USING sortop LIMIT 1;
       If this parameter is omitted or is zero, a default estimate is used
       based on the <replaceable>state_data_type</replaceable>.
       The planner uses this value to estimate the memory required for a
-      grouped aggregate query.  The planner will consider using hash
-      aggregation for such a query only if the hash table is estimated to fit
-      in <xref linkend="guc-work-mem"/>; therefore, large values of this
-      parameter discourage use of hash aggregation.
+      grouped aggregate query.
      </para>
     </listitem>
    </varlistentry>

From b1d79127ed875f04720d2c4677a75f43528bfe08 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Jul 2020 17:14:07 -0700
Subject: [PATCH 26/54] Correct obsolete UNION hash aggs comment.

Oversight in commit 1f39bce0, which added disk-based hash aggregation.

Backpatch: 13-, where disk-based hash aggregation was introduced.
---
 src/backend/optimizer/prep/prepunion.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 951aed80e7a20..6588f83d5ec6f 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -944,11 +944,10 @@ make_union_unique(SetOperationStmt *op, Path *path, List *tlist,
 
 	/*
 	 * XXX for the moment, take the number of distinct groups as equal to the
-	 * total input size, ie, the worst case.  This is too conservative, but we
-	 * don't want to risk having the hashtable overrun memory; also, it's not
-	 * clear how to get a decent estimate of the true size.  One should note
-	 * as well the propensity of novices to write UNION rather than UNION ALL
-	 * even when they don't expect any duplicates...
+	 * total input size, ie, the worst case.  This is too conservative, but
+	 * it's not clear how to get a decent estimate of the true size.  One
+	 * should note as well the propensity of novices to write UNION rather
+	 * than UNION ALL even when they don't expect any duplicates...
 	 */
 	dNumGroups = path->rows;
 

From c49c74d19241b1fc8da6c215ebb40fd6b71c1bff Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Jul 2020 17:59:16 -0700
Subject: [PATCH 27/54] Rename another "hash_mem" local variable.

Missed by my commit 564ce621.

Backpatch: 13-, where disk-based hash aggregation was introduced.
---
 src/backend/executor/nodeAgg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 7d7bfa945605b..586509c50b27e 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1839,15 +1839,15 @@ hash_agg_check_limits(AggState *aggstate)
 	uint64		ngroups = aggstate->hash_ngroups_current;
 	Size		meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
 													 true);
-	Size		hash_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
-													 true);
+	Size		hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+														true);
 
 	/*
 	 * Don't spill unless there's at least one group in the hash table so we
 	 * can be sure to make progress even in edge cases.
 	 */
 	if (aggstate->hash_ngroups_current > 0 &&
-		(meta_mem + hash_mem > aggstate->hash_mem_limit ||
+		(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
 		 ngroups > aggstate->hash_ngroups_limit))
 	{
 		hash_agg_enter_spill_mode(aggstate);

From cb04ad498551dcdb91a834c2e8730cdf0b77e70a Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Wed, 29 Jul 2020 16:46:58 +1200
Subject: [PATCH 28/54] Move syncscan.c to src/backend/access/common.

Since the tableam.c code needs to make use of the syncscan.c routines
itself, and since other block-oriented AMs might also want to use it one
day, it didn't make sense for it to live under src/backend/access/heap.

Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGLCnG%3DNEAByg6bk%2BCT9JZD97Y%3DAxKhh27Su9FeGWOKvDg%40mail.gmail.com
---
 src/backend/access/common/Makefile            |  1 +
 .../access/{heap => common}/syncscan.c        |  6 ++---
 src/backend/access/heap/Makefile              |  1 -
 src/backend/access/heap/heapam.c              |  1 +
 src/backend/access/heap/heapam_handler.c      |  1 +
 src/backend/access/table/tableam.c            |  2 +-
 src/backend/storage/ipc/ipci.c                |  1 +
 src/include/access/heapam.h                   |  6 -----
 src/include/access/syncscan.h                 | 25 +++++++++++++++++++
 9 files changed, 33 insertions(+), 11 deletions(-)
 rename src/backend/access/{heap => common}/syncscan.c (98%)
 create mode 100644 src/include/access/syncscan.h

diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile
index fd74e14024c30..5a007d63f1a95 100644
--- a/src/backend/access/common/Makefile
+++ b/src/backend/access/common/Makefile
@@ -24,6 +24,7 @@ OBJS = \
 	reloptions.o \
 	scankey.o \
 	session.o \
+	syncscan.o \
 	toast_internals.o \
 	tupconvert.o \
 	tupdesc.o
diff --git a/src/backend/access/heap/syncscan.c b/src/backend/access/common/syncscan.c
similarity index 98%
rename from src/backend/access/heap/syncscan.c
rename to src/backend/access/common/syncscan.c
index a32f6836f80c4..c1ce156902bef 100644
--- a/src/backend/access/heap/syncscan.c
+++ b/src/backend/access/common/syncscan.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * syncscan.c
- *	  heap scan synchronization support
+ *	  scan synchronization support
  *
  * When multiple backends run a sequential scan on the same table, we try
  * to keep them synchronized to reduce the overall I/O needed.  The goal is
@@ -40,13 +40,13 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  src/backend/access/heap/syncscan.c
+ *	  src/backend/access/common/syncscan.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include "access/heapam.h"
+#include "access/syncscan.h"
 #include "miscadmin.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile
index 51a7f5e0d01e9..af0bd1888e53f 100644
--- a/src/backend/access/heap/Makefile
+++ b/src/backend/access/heap/Makefile
@@ -20,7 +20,6 @@ OBJS = \
 	hio.o \
 	pruneheap.o \
 	rewriteheap.o \
-	syncscan.o \
 	vacuumlazy.o \
 	visibilitymap.o
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 2c9bb0c7ee248..8df2716de46cc 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -41,6 +41,7 @@
 #include "access/parallel.h"
 #include "access/relscan.h"
 #include "access/subtrans.h"
+#include "access/syncscan.h"
 #include "access/sysattr.h"
 #include "access/tableam.h"
 #include "access/transam.h"
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 8f2e5379210ca..267a6ee25a759 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -24,6 +24,7 @@
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
+#include "access/syncscan.h"
 #include "access/tableam.h"
 #include "access/tsmapi.h"
 #include "access/xact.h"
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 4e8553de2afc5..3afb63b1fe4db 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -21,7 +21,7 @@
 
 #include <math.h>
 
-#include "access/heapam.h"		/* for ss_* */
+#include "access/syncscan.h"
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "optimizer/plancat.h"
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 427b0d59cde2c..e850ebd131e3f 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -20,6 +20,7 @@
 #include "access/multixact.h"
 #include "access/nbtree.h"
 #include "access/subtrans.h"
+#include "access/syncscan.h"
 #include "access/twophase.h"
 #include "commands/async.h"
 #include "miscadmin.h"
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f279edc47347f..b31de389106d8 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -182,12 +182,6 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *nowunused, int nunused);
 extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
-/* in heap/syncscan.c */
-extern void ss_report_location(Relation rel, BlockNumber location);
-extern BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks);
-extern void SyncScanShmemInit(void);
-extern Size SyncScanShmemSize(void);
-
 /* in heap/vacuumlazy.c */
 struct VacuumParams;
 extern void heap_vacuum_rel(Relation onerel,
diff --git a/src/include/access/syncscan.h b/src/include/access/syncscan.h
new file mode 100644
index 0000000000000..7cbf63c399fe5
--- /dev/null
+++ b/src/include/access/syncscan.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * syncscan.h
+ *    POSTGRES synchronous scan support functions.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/syncscan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYNCSCAN_H
+#define SYNCSCAN_H
+
+#include "storage/block.h"
+#include "utils/relcache.h"
+
+extern void ss_report_location(Relation rel, BlockNumber location);
+extern BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks);
+extern void SyncScanShmemInit(void);
+extern Size SyncScanShmemSize(void);
+
+#endif

From f2130e77da51f35d37fd15a343bc1c4a4527e0fd Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Wed, 29 Jul 2020 14:44:32 +0900
Subject: [PATCH 29/54] Fix incorrect print format in json.c

Oid is unsigned, so %u needs to be used and not %d.  The code path
involved here is not normally reachable, so no backpatch is done.

Author: Justin Pryzby
Discussion: https://postgr.es/m/20200728015523.GA27308@telsasoft.com
---
 src/backend/utils/adt/json.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 641ae3fdf8e3a..a7a91b72f69b2 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -458,7 +458,7 @@ JsonEncodeDateTime(char *buf, Datum value, Oid typid, const int *tzp)
 			}
 			break;
 		default:
-			elog(ERROR, "unknown jsonb value datetime type oid %d", typid);
+			elog(ERROR, "unknown jsonb value datetime type oid %u", typid);
 			return NULL;
 	}
 

From 9878b643f37b1e4167f64a9941244bfabed60623 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Tue, 28 Jul 2020 23:15:47 -0700
Subject: [PATCH 30/54] HashAgg: use better cardinality estimate for recursive
 spilling.

Use HyperLogLog to estimate the group cardinality in a spilled
partition. This estimate is used to choose the number of partitions if
we recurse.

The previous behavior was to use the number of tuples in a spilled
partition as the estimate for the number of groups, which lead to
overpartitioning. That could cause the number of batches to be much
higher than expected (with each batch being very small), which made it
harder to interpret EXPLAIN ANALYZE results.

Reviewed-by: Peter Geoghegan
Discussion: https://postgr.es/m/a856635f9284bc36f7a77d02f47bbb6aaf7b59b3.camel@j-davis.com
Backpatch-through: 13
---
 src/backend/executor/nodeAgg.c | 64 ++++++++++++++++++++++------------
 src/include/executor/nodeAgg.h |  2 +-
 2 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 586509c50b27e..02a9165c6941e 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -245,9 +245,11 @@
 #include "catalog/pg_aggregate.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "common/hashfn.h"
 #include "executor/execExpr.h"
 #include "executor/executor.h"
 #include "executor/nodeAgg.h"
+#include "lib/hyperloglog.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
@@ -295,6 +297,14 @@
 #define HASHAGG_READ_BUFFER_SIZE BLCKSZ
 #define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ
 
+/*
+ * HyperLogLog is used for estimating the cardinality of the spilled tuples in
+ * a given partition. 5 bits corresponds to a size of about 32 bytes and a
+ * worst-case error of around 18%. That's effective enough to choose a
+ * reasonable number of partitions when recursing.
+ */
+#define HASHAGG_HLL_BIT_WIDTH 5
+
 /*
  * Estimate chunk overhead as a constant 16 bytes. XXX: should this be
  * improved?
@@ -339,6 +349,7 @@ typedef struct HashAggSpill
 	int64	   *ntuples;		/* number of tuples in each partition */
 	uint32		mask;			/* mask to find partition from hash value */
 	int			shift;			/* after masking, shift by this amount */
+	hyperLogLogState *hll_card;	/* cardinality estimate for contents */
 } HashAggSpill;
 
 /*
@@ -357,6 +368,7 @@ typedef struct HashAggBatch
 	LogicalTapeSet *tapeset;	/* borrowed reference to tape set */
 	int			input_tapenum;	/* input partition tape */
 	int64		input_tuples;	/* number of tuples in this batch */
+	double		input_card;		/* estimated group cardinality */
 } HashAggBatch;
 
 /* used to find referenced colnos */
@@ -411,7 +423,7 @@ static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
 static long hash_choose_num_buckets(double hashentrysize,
 									long estimated_nbuckets,
 									Size memory);
-static int	hash_choose_num_partitions(uint64 input_groups,
+static int	hash_choose_num_partitions(double input_groups,
 									   double hashentrysize,
 									   int used_bits,
 									   int *log2_npartittions);
@@ -432,10 +444,11 @@ static void hashagg_finish_initial_spills(AggState *aggstate);
 static void hashagg_reset_spill_state(AggState *aggstate);
 static HashAggBatch *hashagg_batch_new(LogicalTapeSet *tapeset,
 									   int input_tapenum, int setno,
-									   int64 input_tuples, int used_bits);
+									   int64 input_tuples, double input_card,
+									   int used_bits);
 static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
 static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo,
-							   int used_bits, uint64 input_tuples,
+							   int used_bits, double input_groups,
 							   double hashentrysize);
 static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
 								TupleTableSlot *slot, uint32 hash);
@@ -1777,7 +1790,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
  * substantially larger than the initial value.
  */
 void
-hash_agg_set_limits(double hashentrysize, uint64 input_groups, int used_bits,
+hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 					Size *mem_limit, uint64 *ngroups_limit,
 					int *num_partitions)
 {
@@ -1969,7 +1982,7 @@ hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
  * *log2_npartitions to the log2() of the number of partitions.
  */
 static int
-hash_choose_num_partitions(uint64 input_groups, double hashentrysize,
+hash_choose_num_partitions(double input_groups, double hashentrysize,
 						   int used_bits, int *log2_npartitions)
 {
 	Size		mem_wanted;
@@ -2574,7 +2587,6 @@ agg_refill_hash_table(AggState *aggstate)
 	AggStatePerHash perhash;
 	HashAggSpill spill;
 	HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
-	uint64		ngroups_estimate;
 	bool		spill_initialized = false;
 
 	if (aggstate->hash_batches == NIL)
@@ -2583,16 +2595,7 @@ agg_refill_hash_table(AggState *aggstate)
 	batch = linitial(aggstate->hash_batches);
 	aggstate->hash_batches = list_delete_first(aggstate->hash_batches);
 
-	/*
-	 * Estimate the number of groups for this batch as the total number of
-	 * tuples in its input file. Although that's a worst case, it's not bad
-	 * here for two reasons: (1) overestimating is better than
-	 * underestimating; and (2) we've already scanned the relation once, so
-	 * it's likely that we've already finalized many of the common values.
-	 */
-	ngroups_estimate = batch->input_tuples;
-
-	hash_agg_set_limits(aggstate->hashentrysize, ngroups_estimate,
+	hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
 						batch->used_bits, &aggstate->hash_mem_limit,
 						&aggstate->hash_ngroups_limit, NULL);
 
@@ -2678,7 +2681,7 @@ agg_refill_hash_table(AggState *aggstate)
 				 */
 				spill_initialized = true;
 				hashagg_spill_init(&spill, tapeinfo, batch->used_bits,
-								   ngroups_estimate, aggstate->hashentrysize);
+								   batch->input_card, aggstate->hashentrysize);
 			}
 			/* no memory for a new group, spill */
 			hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
@@ -2936,7 +2939,7 @@ hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum)
  */
 static void
 hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
-				   uint64 input_groups, double hashentrysize)
+				   double input_groups, double hashentrysize)
 {
 	int			npartitions;
 	int			partition_bits;
@@ -2946,6 +2949,7 @@ hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
 
 	spill->partitions = palloc0(sizeof(int) * npartitions);
 	spill->ntuples = palloc0(sizeof(int64) * npartitions);
+	spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);
 
 	hashagg_tapeinfo_assign(tapeinfo, spill->partitions, npartitions);
 
@@ -2953,6 +2957,9 @@ hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
 	spill->shift = 32 - used_bits - partition_bits;
 	spill->mask = (npartitions - 1) << spill->shift;
 	spill->npartitions = npartitions;
+
+	for (int i = 0; i < npartitions; i++)
+		initHyperLogLog(&spill->hll_card[i], HASHAGG_HLL_BIT_WIDTH);
 }
 
 /*
@@ -3001,6 +3008,13 @@ hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
 	partition = (hash & spill->mask) >> spill->shift;
 	spill->ntuples[partition]++;
 
+	/*
+	 * All hash values destined for a given partition have some bits in
+	 * common, which causes bad HLL cardinality estimates. Hash the hash to
+	 * get a more uniform distribution.
+	 */
+	addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));
+
 	tapenum = spill->partitions[partition];
 
 	LogicalTapeWrite(tapeset, tapenum, (void *) &hash, sizeof(uint32));
@@ -3023,7 +3037,7 @@ hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
  */
 static HashAggBatch *
 hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
-				  int64 input_tuples, int used_bits)
+				  int64 input_tuples, double input_card, int used_bits)
 {
 	HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
 
@@ -3032,6 +3046,7 @@ hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
 	batch->tapeset = tapeset;
 	batch->input_tapenum = tapenum;
 	batch->input_tuples = input_tuples;
+	batch->input_card = input_card;
 
 	return batch;
 }
@@ -3135,21 +3150,26 @@ hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
 
 	for (i = 0; i < spill->npartitions; i++)
 	{
-		int			tapenum = spill->partitions[i];
-		HashAggBatch *new_batch;
+		int				 tapenum = spill->partitions[i];
+		HashAggBatch	*new_batch;
+		double			 cardinality;
 
 		/* if the partition is empty, don't create a new batch of work */
 		if (spill->ntuples[i] == 0)
 			continue;
 
+		cardinality = estimateHyperLogLog(&spill->hll_card[i]);
+		freeHyperLogLog(&spill->hll_card[i]);
+
 		new_batch = hashagg_batch_new(aggstate->hash_tapeinfo->tapeset,
 									  tapenum, setno, spill->ntuples[i],
-									  used_bits);
+									  cardinality, used_bits);
 		aggstate->hash_batches = lcons(new_batch, aggstate->hash_batches);
 		aggstate->hash_batches_used++;
 	}
 
 	pfree(spill->ntuples);
+	pfree(spill->hll_card);
 	pfree(spill->partitions);
 }
 
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index bb0805abe091a..b955169538444 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -320,7 +320,7 @@ extern void ExecReScanAgg(AggState *node);
 
 extern Size hash_agg_entry_size(int numTrans, Size tupleWidth,
 								Size transitionSpace);
-extern void hash_agg_set_limits(double hashentrysize, uint64 input_groups,
+extern void hash_agg_set_limits(double hashentrysize, double input_groups,
 								int used_bits, Size *mem_limit,
 								uint64 *ngroups_limit, int *num_partitions);
 

From b5310e4ff6b7b0b14a5ee2443839fbf3553623ea Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Wed, 29 Jul 2020 21:24:26 +0900
Subject: [PATCH 31/54] Remove non-fast promotion.

When fast promotion was supported in 9.3, non-fast promotion became
undocumented feature and it's basically not available for ordinary users.
However we decided not to remove non-fast promotion at that moment,
to leave it for a release or two for debugging purpose or as an emergency
method because fast promotion might have some issues, and then to
remove it later. Now, several versions were released since that decision
and there is no longer reason to keep supporting non-fast promotion.
Therefore this commit removes non-fast promotion.

Author: Fujii Masao
Reviewed-by: Hamid Akhtar, Kyotaro Horiguchi
Discussion: https://postgr.es/m/76066434-648f-f567-437b-54853b43398f@oss.nttdata.com
---
 src/backend/access/transam/xlog.c   | 48 +++++++----------------------
 src/backend/postmaster/postmaster.c |  7 ++++-
 src/bin/pg_ctl/pg_ctl.c             |  5 ---
 src/include/access/xlog.h           |  1 -
 4 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 184c6672f3be0..756b838e6a543 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -299,9 +299,6 @@ bool		wal_receiver_create_temp_slot = false;
 /* are we currently in standby mode? */
 bool		StandbyMode = false;
 
-/* whether request for fast promotion has been made yet */
-static bool fast_promote = false;
-
 /*
  * if recoveryStopsBefore/After returns true, it saves information of the stop
  * point here
@@ -6322,7 +6319,7 @@ StartupXLOG(void)
 	DBState		dbstate_at_startup;
 	XLogReaderState *xlogreader;
 	XLogPageReadPrivate private;
-	bool		fast_promoted = false;
+	bool		promoted = false;
 	struct stat st;
 
 	/*
@@ -7727,14 +7724,14 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 *
-		 * In fast promotion, only create a lightweight end-of-recovery record
+		 * In promotion, only create a lightweight end-of-recovery record
 		 * instead of a full checkpoint. A checkpoint is requested later,
 		 * after we're fully out of recovery mode and already accepting
 		 * queries.
 		 */
 		if (bgwriterLaunched)
 		{
-			if (fast_promote)
+			if (LocalPromoteIsTriggered)
 			{
 				checkPointLoc = ControlFile->checkPoint;
 
@@ -7745,7 +7742,7 @@ StartupXLOG(void)
 				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
 				if (record != NULL)
 				{
-					fast_promoted = true;
+					promoted = true;
 
 					/*
 					 * Insert a special WAL record to mark the end of
@@ -7762,7 +7759,7 @@ StartupXLOG(void)
 				}
 			}
 
-			if (!fast_promoted)
+			if (!promoted)
 				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 								  CHECKPOINT_IMMEDIATE |
 								  CHECKPOINT_WAIT);
@@ -7953,12 +7950,12 @@ StartupXLOG(void)
 	WalSndWakeup();
 
 	/*
-	 * If this was a fast promotion, request an (online) checkpoint now. This
+	 * If this was a promotion, request an (online) checkpoint now. This
 	 * isn't required for consistency, but the last restartpoint might be far
 	 * back, and in case of a crash, recovering from it might take a longer
 	 * than is appropriate now that we're not in standby mode anymore.
 	 */
-	if (fast_promoted)
+	if (promoted)
 		RequestCheckpoint(CHECKPOINT_FORCE);
 }
 
@@ -12592,29 +12589,10 @@ CheckForStandbyTrigger(void)
 	if (LocalPromoteIsTriggered)
 		return true;
 
-	if (IsPromoteSignaled())
+	if (IsPromoteSignaled() && CheckPromoteSignal())
 	{
-		/*
-		 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
-		 * signal handler. It now leaves the file in place and lets the
-		 * Startup process do the unlink. This allows Startup to know whether
-		 * it should create a full checkpoint before starting up (fallback
-		 * mode). Fast promotion takes precedence.
-		 */
-		if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-		{
-			unlink(PROMOTE_SIGNAL_FILE);
-			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
-			fast_promote = true;
-		}
-		else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-		{
-			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
-			fast_promote = false;
-		}
-
 		ereport(LOG, (errmsg("received promote request")));
-
+		RemovePromoteSignalFiles();
 		ResetPromoteSignaled();
 		SetPromoteIsTriggered();
 		return true;
@@ -12629,7 +12607,6 @@ CheckForStandbyTrigger(void)
 				(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
 		unlink(PromoteTriggerFile);
 		SetPromoteIsTriggered();
-		fast_promote = true;
 		return true;
 	}
 	else if (errno != ENOENT)
@@ -12648,20 +12625,17 @@ void
 RemovePromoteSignalFiles(void)
 {
 	unlink(PROMOTE_SIGNAL_FILE);
-	unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
 }
 
 /*
- * Check to see if a promote request has arrived. Should be
- * called by postmaster after receiving SIGUSR1.
+ * Check to see if a promote request has arrived.
  */
 bool
 CheckPromoteSignal(void)
 {
 	struct stat stat_buf;
 
-	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
-		stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
 		return true;
 
 	return false;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index dec02586c7f12..1db6a3d29d01e 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -5333,7 +5333,12 @@ sigusr1_handler(SIGNAL_ARGS)
 		 pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) &&
 		CheckPromoteSignal())
 	{
-		/* Tell startup process to finish recovery */
+		/*
+		 * Tell startup process to finish recovery.
+		 *
+		 * Leave the promote signal file in place and let the Startup
+		 * process do the unlink.
+		 */
 		signal_child(StartupPID, SIGUSR2);
 	}
 
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 3c03ace7ed6ba..1cdc3ebaa338d 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1195,11 +1195,6 @@ do_promote(void)
 		exit(1);
 	}
 
-	/*
-	 * For 9.3 onwards, "fast" promotion is performed. Promotion with a full
-	 * checkpoint is still possible by writing a file called
-	 * "fallback_promote" instead of "promote"
-	 */
 	snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
 
 	if ((prmfile = fopen(promote_file, "w")) == NULL)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 219a7299e1f0e..221af87e715b8 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -394,6 +394,5 @@ extern SessionBackupState get_backup_status(void);
 
 /* files to signal promotion to primary */
 #define PROMOTE_SIGNAL_FILE		"promote"
-#define FALLBACK_PROMOTE_SIGNAL_FILE  "fallback_promote"
 
 #endif							/* XLOG_H */

From 6023b7ea717ca04cf1bd53709d9c862db07eaefb Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Wed, 29 Jul 2020 23:21:55 +0900
Subject: [PATCH 32/54] pg_stat_statements: track number of rows processed by
 some utility commands.

This commit makes pg_stat_statements track the total number
of rows retrieved or affected by CREATE TABLE AS, SELECT INTO,
CREATE MATERIALIZED VIEW and FETCH commands.

Suggested-by: Pascal Legrand
Author: Fujii Masao
Reviewed-by: Asif Rehman
Discussion: https://postgr.es/m/1584293755198-0.post@n3.nabble.com
---
 .../expected/pg_stat_statements.out           | 66 +++++++++++++++++++
 .../pg_stat_statements/pg_stat_statements.c   | 10 ++-
 .../sql/pg_stat_statements.sql                | 27 ++++++++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/contrib/pg_stat_statements/expected/pg_stat_statements.out b/contrib/pg_stat_statements/expected/pg_stat_statements.out
index c3f013860ae3b..e0edb134f3dca 100644
--- a/contrib/pg_stat_statements/expected/pg_stat_statements.out
+++ b/contrib/pg_stat_statements/expected/pg_stat_statements.out
@@ -528,6 +528,69 @@ SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
  SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C" |     0 |    0
 (9 rows)
 
+--
+-- Track the total number of rows retrieved or affected by the utility
+-- commands of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED VIEW
+-- and SELECT INTO
+--
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+CREATE TABLE pgss_ctas AS SELECT a, 'ctas' b FROM generate_series(1, 10) a;
+SELECT generate_series(1, 10) c INTO pgss_select_into;
+COPY pgss_ctas (a, b) FROM STDIN;
+CREATE MATERIALIZED VIEW pgss_matv AS SELECT * FROM pgss_ctas;
+BEGIN;
+DECLARE pgss_cursor CURSOR FOR SELECT * FROM pgss_matv;
+FETCH NEXT pgss_cursor;
+ a |  b   
+---+------
+ 1 | ctas
+(1 row)
+
+FETCH FORWARD 5 pgss_cursor;
+ a |  b   
+---+------
+ 2 | ctas
+ 3 | ctas
+ 4 | ctas
+ 5 | ctas
+ 6 | ctas
+(5 rows)
+
+FETCH FORWARD ALL pgss_cursor;
+ a  |  b   
+----+------
+  7 | ctas
+  8 | ctas
+  9 | ctas
+ 10 | ctas
+ 11 | copy
+ 12 | copy
+ 13 | copy
+(7 rows)
+
+COMMIT;
+SELECT query, plans, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                        query                                        | plans | calls | rows 
+-------------------------------------------------------------------------------------+-------+-------+------
+ BEGIN                                                                               |     0 |     1 |    0
+ COMMIT                                                                              |     0 |     1 |    0
+ COPY pgss_ctas (a, b) FROM STDIN                                                    |     0 |     1 |    3
+ CREATE MATERIALIZED VIEW pgss_matv AS SELECT * FROM pgss_ctas                       |     0 |     1 |   13
+ CREATE TABLE pgss_ctas AS SELECT a, 'ctas' b FROM generate_series(1, 10) a          |     0 |     1 |   10
+ DECLARE pgss_cursor CURSOR FOR SELECT * FROM pgss_matv                              |     0 |     1 |    0
+ FETCH FORWARD 5 pgss_cursor                                                         |     0 |     1 |    5
+ FETCH FORWARD ALL pgss_cursor                                                       |     0 |     1 |    7
+ FETCH NEXT pgss_cursor                                                              |     0 |     1 |    1
+ SELECT generate_series(1, 10) c INTO pgss_select_into                               |     0 |     1 |   10
+ SELECT pg_stat_statements_reset()                                                   |     0 |     1 |    1
+ SELECT query, plans, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C" |     1 |     0 |    0
+(12 rows)
+
 --
 -- Track user activity and reset them
 --
@@ -728,6 +791,9 @@ SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
 --
 DROP ROLE regress_stats_user1;
 DROP ROLE regress_stats_user2;
+DROP MATERIALIZED VIEW pgss_matv;
+DROP TABLE pgss_ctas;
+DROP TABLE pgss_select_into;
 --
 -- [re]plan counting
 --
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 14cad19afbc5a..6b91c62c31a88 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -1170,7 +1170,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 		INSTR_TIME_SET_CURRENT(duration);
 		INSTR_TIME_SUBTRACT(duration, start);
 
-		rows = (qc && qc->commandTag == CMDTAG_COPY) ? qc->nprocessed : 0;
+		/*
+		 * Track the total number of rows retrieved or affected by
+		 * the utility statements of COPY, FETCH, CREATE TABLE AS,
+		 * CREATE MATERIALIZED VIEW and SELECT INTO.
+		 */
+		rows = (qc && (qc->commandTag == CMDTAG_COPY ||
+					   qc->commandTag == CMDTAG_FETCH ||
+					   qc->commandTag == CMDTAG_SELECT)) ?
+			qc->nprocessed : 0;
 
 		/* calc differences of buffer counters. */
 		memset(&bufusage, 0, sizeof(BufferUsage));
diff --git a/contrib/pg_stat_statements/sql/pg_stat_statements.sql b/contrib/pg_stat_statements/sql/pg_stat_statements.sql
index 6ed8e38028021..996a24a293c5b 100644
--- a/contrib/pg_stat_statements/sql/pg_stat_statements.sql
+++ b/contrib/pg_stat_statements/sql/pg_stat_statements.sql
@@ -250,6 +250,30 @@ DROP FUNCTION PLUS_TWO(INTEGER);
 
 SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
 
+--
+-- Track the total number of rows retrieved or affected by the utility
+-- commands of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED VIEW
+-- and SELECT INTO
+--
+SELECT pg_stat_statements_reset();
+
+CREATE TABLE pgss_ctas AS SELECT a, 'ctas' b FROM generate_series(1, 10) a;
+SELECT generate_series(1, 10) c INTO pgss_select_into;
+COPY pgss_ctas (a, b) FROM STDIN;
+11	copy
+12	copy
+13	copy
+\.
+CREATE MATERIALIZED VIEW pgss_matv AS SELECT * FROM pgss_ctas;
+BEGIN;
+DECLARE pgss_cursor CURSOR FOR SELECT * FROM pgss_matv;
+FETCH NEXT pgss_cursor;
+FETCH FORWARD 5 pgss_cursor;
+FETCH FORWARD ALL pgss_cursor;
+COMMIT;
+
+SELECT query, plans, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
+
 --
 -- Track user activity and reset them
 --
@@ -313,6 +337,9 @@ SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
 --
 DROP ROLE regress_stats_user1;
 DROP ROLE regress_stats_user2;
+DROP MATERIALIZED VIEW pgss_matv;
+DROP TABLE pgss_ctas;
+DROP TABLE pgss_select_into;
 
 --
 -- [re]plan counting

From d6c08e29e7bc8bc3bf49764192c4a9c71fc0b097 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 29 Jul 2020 14:14:58 -0700
Subject: [PATCH 33/54] Add hash_mem_multiplier GUC.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a GUC that acts as a multiplier on work_mem.  It gets applied when
sizing executor node hash tables that were previously size constrained
using work_mem alone.

The new GUC can be used to preferentially give hash-based nodes more
memory than the generic work_mem limit.  It is intended to enable admin
tuning of the executor's memory usage.  Overall system throughput and
system responsiveness can be improved by giving hash-based executor
nodes more memory (especially over sort-based alternatives, which are
often much less sensitive to being memory constrained).

The default value for hash_mem_multiplier is 1.0, which is also the
minimum valid value.  This means that hash-based nodes continue to apply
work_mem in the traditional way by default.

hash_mem_multiplier is generally useful.  However, it is being added now
due to concerns about hash aggregate performance stability for users
that upgrade to Postgres 13 (which added disk-based hash aggregation in
commit 1f39bce0).  While the old hash aggregate behavior risked
out-of-memory errors, it is nevertheless likely that many users actually
benefited.  Hash agg's previous indifference to work_mem during query
execution was not just faster; it also accidentally made aggregation
resilient to grouping estimate problems (at least in cases where this
didn't create destabilizing memory pressure).

hash_mem_multiplier can provide a certain kind of continuity with the
behavior of Postgres 12 hash aggregates in cases where the planner
incorrectly estimates that all groups (plus related allocations) will
fit in work_mem/hash_mem.  This seems necessary because hash-based
aggregation is usually much slower when only a small fraction of all
groups can fit.  Even when it isn't possible to totally avoid hash
aggregates that spill, giving hash aggregation more memory will reliably
improve performance (the same cannot be said for external sort
operations, which appear to be almost unaffected by memory availability
provided it's at least possible to get a single merge pass).

The PostgreSQL 13 release notes should advise users that increasing
hash_mem_multiplier can help with performance regressions associated
with hash aggregation.  That can be taken care of by a later commit.

Author: Peter Geoghegan
Reviewed-By: Álvaro Herrera, Jeff Davis
Discussion: https://postgr.es/m/20200625203629.7m6yvut7eqblgmfo@alap3.anarazel.de
Discussion: https://postgr.es/m/CAH2-WzmD%2Bi1pG6rc1%2BCjc4V6EaFJ_qSuKCCHVnH%3DoruqD-zqow%40mail.gmail.com
Backpatch: 13-, where disk-based hash aggregation was introduced.
---
 doc/src/sgml/config.sgml                      | 60 +++++++++++---
 doc/src/sgml/ref/postgres-ref.sgml            |  8 +-
 doc/src/sgml/runtime.sgml                     | 10 ++-
 src/backend/executor/execGrouping.c           |  5 +-
 src/backend/executor/nodeAgg.c                | 30 +++----
 src/backend/executor/nodeHash.c               | 80 ++++++++++++++-----
 src/backend/executor/nodeHashjoin.c           |  4 +-
 src/backend/optimizer/path/costsize.c         | 12 +--
 src/backend/optimizer/plan/planner.c          | 15 ++--
 src/backend/optimizer/plan/subselect.c        |  9 ++-
 src/backend/optimizer/prep/prepunion.c        |  9 ++-
 src/backend/optimizer/util/pathnode.c         |  3 +-
 src/backend/utils/adt/ri_triggers.c           | 18 ++++-
 src/backend/utils/init/globals.c              |  1 +
 src/backend/utils/misc/guc.c                  | 11 +++
 src/backend/utils/misc/postgresql.conf.sample |  1 +
 src/include/executor/hashjoin.h               |  4 +-
 src/include/executor/nodeHash.h               |  2 +-
 src/include/miscadmin.h                       |  4 +
 19 files changed, 205 insertions(+), 81 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 822bbf1f27269..427947cf4962e 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1690,22 +1690,64 @@ include_dir 'conf.d'
       </term>
       <listitem>
        <para>
-        Sets the maximum amount of memory to be used by a query operation
+        Sets the base maximum amount of memory to be used by a query operation
         (such as a sort or hash table) before writing to temporary disk files.
         If this value is specified without units, it is taken as kilobytes.
         The default value is four megabytes (<literal>4MB</literal>).
         Note that for a complex query, several sort or hash operations might be
-        running in parallel; each operation will be allowed to use as much memory
-        as this value specifies before it starts to write data into temporary
-        files. Also, several running sessions could be doing such operations
-        concurrently.  Therefore, the total memory used could be many
-        times the value of <varname>work_mem</varname>; it is necessary to
-        keep this fact in mind when choosing the value. Sort operations are
-        used for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>, and
-        merge joins.
+        running in parallel; each operation will generally be allowed
+        to use as much memory as this value specifies before it starts
+        to write data into temporary files.  Also, several running
+        sessions could be doing such operations concurrently.
+        Therefore, the total memory used could be many times the value
+        of <varname>work_mem</varname>; it is necessary to keep this
+        fact in mind when choosing the value.  Sort operations are used
+        for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>,
+        and merge joins.
         Hash tables are used in hash joins, hash-based aggregation, and
         hash-based processing of <literal>IN</literal> subqueries.
        </para>
+       <para>
+        Hash-based operations are generally more sensitive to memory
+        availability than equivalent sort-based operations.  The
+        memory available for hash tables is computed by multiplying
+        <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  This makes it
+        possible for hash-based operations to use an amount of memory
+        that exceeds the usual <varname>work_mem</varname> base
+        amount.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-hash-mem-multiplier" xreflabel="hash_mem_multiplier">
+      <term><varname>hash_mem_multiplier</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>hash_mem_multiplier</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Used to compute the maximum amount of memory that hash-based
+        operations can use.  The final limit is determined by
+        multiplying <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  The default value is
+        1.0, which makes hash-based operations subject to the same
+        simple <varname>work_mem</varname> maximum as sort-based
+        operations.
+       </para>
+       <para>
+        Consider increasing <varname>hash_mem_multiplier</varname> in
+        environments where spilling by query operations is a regular
+        occurrence, especially when simply increasing
+        <varname>work_mem</varname> results in memory pressure (memory
+        pressure typically takes the form of intermittent out of
+        memory errors).  A setting of 1.5 or 2.0 may be effective with
+        mixed workloads.  Higher settings in the range of 2.0 - 8.0 or
+        more may be effective in environments where
+        <varname>work_mem</varname> has already been increased to 40MB
+        or more.
+       </para>
       </listitem>
      </varlistentry>
 
diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml
index 5e5794bc90d8b..6e62f54c597c0 100644
--- a/doc/src/sgml/ref/postgres-ref.sgml
+++ b/doc/src/sgml/ref/postgres-ref.sgml
@@ -338,10 +338,10 @@ PostgreSQL documentation
       <term><option>-S</option> <replaceable class="parameter">work-mem</replaceable></term>
       <listitem>
        <para>
-        Specifies the amount of memory to be used by internal sorts and hashes
-        before resorting to temporary disk files.  See the description of the
-        <varname>work_mem</varname> configuration parameter in <xref
-        linkend="runtime-config-resource-memory"/>.
+        Specifies the base amount of memory to be used by sorts and
+        hash tables before resorting to temporary disk files.  See the
+        description of the <varname>work_mem</varname> configuration
+        parameter in <xref linkend="runtime-config-resource-memory"/>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index e09cb55efcd6c..c8698898f3256 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1326,10 +1326,12 @@ Out of Memory: Killed process 12345 (postgres).
     system running out of memory, you can avoid the problem by changing
     your configuration.  In some cases, it may help to lower memory-related
     configuration parameters, particularly
-    <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>
-    and <link linkend="guc-work-mem"><varname>work_mem</varname></link>.  In
-    other cases, the problem may be caused by allowing too many connections
-    to the database server itself.  In many cases, it may be better to reduce
+    <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>,
+    <link linkend="guc-work-mem"><varname>work_mem</varname></link>, and
+    <link linkend="guc-hash-mem-multiplier"><varname>hash_mem_multiplier</varname></link>.
+    In other cases, the problem may be caused by allowing too many
+    connections to the database server itself.  In many cases, it may
+    be better to reduce
     <link linkend="guc-max-connections"><varname>max_connections</varname></link>
     and instead make use of external connection-pooling software.
    </para>
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 321f427e478fc..90d04f9228aa4 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -165,13 +165,14 @@ BuildTupleHashTableExt(PlanState *parent,
 {
 	TupleHashTable hashtable;
 	Size		entrysize = sizeof(TupleHashEntryData) + additionalsize;
+	int			hash_mem = get_hash_mem();
 	MemoryContext oldcontext;
 	bool		allow_jit;
 
 	Assert(nbuckets > 0);
 
-	/* Limit initial table size request to not more than work_mem */
-	nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize));
+	/* Limit initial table size request to not more than hash_mem */
+	nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
 
 	oldcontext = MemoryContextSwitchTo(metacxt);
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 02a9165c6941e..9776263ae75ac 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -203,7 +203,7 @@
  *	  entries (and initialize new transition states), we instead spill them to
  *	  disk to be processed later. The tuples are spilled in a partitioned
  *	  manner, so that subsequent batches are smaller and less likely to exceed
- *	  work_mem (if a batch does exceed work_mem, it must be spilled
+ *	  hash_mem (if a batch does exceed hash_mem, it must be spilled
  *	  recursively).
  *
  *	  Spilled data is written to logical tapes. These provide better control
@@ -212,7 +212,7 @@
  *
  *	  Note that it's possible for transition states to start small but then
  *	  grow very large; for instance in the case of ARRAY_AGG. In such cases,
- *	  it's still possible to significantly exceed work_mem. We try to avoid
+ *	  it's still possible to significantly exceed hash_mem. We try to avoid
  *	  this situation by estimating what will fit in the available memory, and
  *	  imposing a limit on the number of groups separately from the amount of
  *	  memory consumed.
@@ -1516,7 +1516,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 
 	/*
 	 * Used to make sure initial hash table allocation does not exceed
-	 * work_mem. Note that the estimate does not include space for
+	 * hash_mem. Note that the estimate does not include space for
 	 * pass-by-reference transition data values, nor for the representative
 	 * tuple of each group.
 	 */
@@ -1782,7 +1782,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
 }
 
 /*
- * Set limits that trigger spilling to avoid exceeding work_mem. Consider the
+ * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
  * number of partitions we expect to create (if we do spill).
  *
  * There are two limits: a memory limit, and also an ngroups limit. The
@@ -1796,13 +1796,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
 	int			npartitions;
 	Size		partition_mem;
+	int			hash_mem = get_hash_mem();
 
-	/* if not expected to spill, use all of work_mem */
-	if (input_groups * hashentrysize < work_mem * 1024L)
+	/* if not expected to spill, use all of hash_mem */
+	if (input_groups * hashentrysize < hash_mem * 1024L)
 	{
 		if (num_partitions != NULL)
 			*num_partitions = 0;
-		*mem_limit = work_mem * 1024L;
+		*mem_limit = hash_mem * 1024L;
 		*ngroups_limit = *mem_limit / hashentrysize;
 		return;
 	}
@@ -1824,14 +1825,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 		HASHAGG_WRITE_BUFFER_SIZE * npartitions;
 
 	/*
-	 * Don't set the limit below 3/4 of work_mem. In that case, we are at the
+	 * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
 	 * minimum number of partitions, so we aren't going to dramatically exceed
 	 * work mem anyway.
 	 */
-	if (work_mem * 1024L > 4 * partition_mem)
-		*mem_limit = work_mem * 1024L - partition_mem;
+	if (hash_mem * 1024L > 4 * partition_mem)
+		*mem_limit = hash_mem * 1024L - partition_mem;
 	else
-		*mem_limit = work_mem * 1024L * 0.75;
+		*mem_limit = hash_mem * 1024L * 0.75;
 
 	if (*mem_limit > hashentrysize)
 		*ngroups_limit = *mem_limit / hashentrysize;
@@ -1989,19 +1990,20 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 	int			partition_limit;
 	int			npartitions;
 	int			partition_bits;
+	int			hash_mem = get_hash_mem();
 
 	/*
 	 * Avoid creating so many partitions that the memory requirements of the
-	 * open partition files are greater than 1/4 of work_mem.
+	 * open partition files are greater than 1/4 of hash_mem.
 	 */
 	partition_limit =
-		(work_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+		(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
 		HASHAGG_WRITE_BUFFER_SIZE;
 
 	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
 	/* make enough partitions so that each one is likely to fit in memory */
-	npartitions = 1 + (mem_wanted / (work_mem * 1024L));
+	npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
 
 	if (npartitions > partition_limit)
 		npartitions = partition_limit;
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 45b342011fe5e..ea69eeb2a1e4b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
 #include "port/atomics.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
+#include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
@@ -506,7 +507,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 	hashtable->spaceAllowed = space_allowed;
 	hashtable->spaceUsedSkew = 0;
 	hashtable->spaceAllowedSkew =
-		hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+		hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
 	hashtable->chunks = NULL;
 	hashtable->current_chunk = NULL;
 	hashtable->parallel_state = state->parallel_state;
@@ -665,7 +666,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-						bool try_combined_work_mem,
+						bool try_combined_hash_mem,
 						int parallel_workers,
 						size_t *space_allowed,
 						int *numbuckets,
@@ -682,6 +683,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	int			nbatch = 1;
 	int			nbuckets;
 	double		dbuckets;
+	int			hash_mem = get_hash_mem();
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -698,16 +700,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	inner_rel_bytes = ntuples * tupsize;
 
 	/*
-	 * Target in-memory hashtable size is work_mem kilobytes.
+	 * Target in-memory hashtable size is hash_mem kilobytes.
 	 */
-	hash_table_bytes = work_mem * 1024L;
+	hash_table_bytes = hash_mem * 1024L;
 
 	/*
-	 * Parallel Hash tries to use the combined work_mem of all workers to
-	 * avoid the need to batch.  If that won't work, it falls back to work_mem
+	 * Parallel Hash tries to use the combined hash_mem of all workers to
+	 * avoid the need to batch.  If that won't work, it falls back to hash_mem
 	 * per worker and tries to process batches in parallel.
 	 */
-	if (try_combined_work_mem)
+	if (try_combined_hash_mem)
 		hash_table_bytes += hash_table_bytes * parallel_workers;
 
 	*space_allowed = hash_table_bytes;
@@ -728,7 +730,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 */
 	if (useskew)
 	{
-		skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
+		skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
 
 		/*----------
 		 * Divisor is:
@@ -751,7 +753,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/*
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 	 * memory is filled, assuming a single batch; but limit the value so that
-	 * the pointer arrays we'll try to allocate do not exceed work_mem nor
+	 * the pointer arrays we'll try to allocate do not exceed hash_mem nor
 	 * MaxAllocSize.
 	 *
 	 * Note that both nbuckets and nbatch must be powers of 2 to make
@@ -790,10 +792,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		long		bucket_size;
 
 		/*
-		 * If Parallel Hash with combined work_mem would still need multiple
-		 * batches, we'll have to fall back to regular work_mem budget.
+		 * If Parallel Hash with combined hash_mem would still need multiple
+		 * batches, we'll have to fall back to regular hash_mem budget.
 		 */
-		if (try_combined_work_mem)
+		if (try_combined_hash_mem)
 		{
 			ExecChooseHashTableSize(ntuples, tupwidth, useskew,
 									false, parallel_workers,
@@ -805,7 +807,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		}
 
 		/*
-		 * Estimate the number of buckets we'll want to have when work_mem is
+		 * Estimate the number of buckets we'll want to have when hash_mem is
 		 * entirely full.  Each bucket will contain a bucket pointer plus
 		 * NTUP_PER_BUCKET tuples, whose projected size already includes
 		 * overhead for the hash code, pointer to the next tuple, etc.
@@ -820,8 +822,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		/*
 		 * Buckets are simple pointers to hashjoin tuples, while tupsize
 		 * includes the pointer, hash code, and MinimalTupleData.  So buckets
-		 * should never really exceed 25% of work_mem (even for
-		 * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
+		 * should never really exceed 25% of hash_mem (even for
+		 * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
 		 * 2^N bytes, where we might get more because of doubling. So let's
 		 * look for 50% here.
 		 */
@@ -1095,15 +1097,17 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 				/* Figure out how many batches to use. */
 				if (hashtable->nbatch == 1)
 				{
+					int			hash_mem = get_hash_mem();
+
 					/*
 					 * We are going from single-batch to multi-batch.  We need
 					 * to switch from one large combined memory budget to the
-					 * regular work_mem budget.
+					 * regular hash_mem budget.
 					 */
-					pstate->space_allowed = work_mem * 1024L;
+					pstate->space_allowed = hash_mem * 1024L;
 
 					/*
-					 * The combined work_mem of all participants wasn't
+					 * The combined hash_mem of all participants wasn't
 					 * enough. Therefore one batch per participant would be
 					 * approximately equivalent and would probably also be
 					 * insufficient.  So try two batches per participant,
@@ -2855,7 +2859,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
 
 		/*
 		 * Check if our space limit would be exceeded.  To avoid choking on
-		 * very large tuples or very low work_mem setting, we'll always allow
+		 * very large tuples or very low hash_mem setting, we'll always allow
 		 * each backend to allocate at least one chunk.
 		 */
 		if (hashtable->batches[0].at_least_one_chunk &&
@@ -3366,3 +3370,41 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 
 	return true;
 }
+
+/*
+ * Get a hash_mem value by multiplying the work_mem GUC's value by the
+ * hash_mem_multiplier GUC's value.
+ *
+ * Returns a work_mem style KB value that hash-based nodes (including but not
+ * limited to hash join) use in place of work_mem.  This is subject to the
+ * same restrictions as work_mem itself.  (There is no such thing as the
+ * hash_mem GUC, but it's convenient for our callers to pretend that there
+ * is.)
+ *
+ * Exported for use by the planner, as well as other hash-based executor
+ * nodes.  This is a rather random place for this, but there is no better
+ * place.
+ */
+int
+get_hash_mem(void)
+{
+	double		hash_mem;
+
+	Assert(hash_mem_multiplier >= 1.0);
+
+	hash_mem = (double) work_mem * hash_mem_multiplier;
+
+	/*
+	 * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
+	 * support the assumption that raw derived byte values can be stored in
+	 * 'long' variables.  The returned hash_mem value must also meet this
+	 * assumption.
+	 *
+	 * We clamp the final value rather than throw an error because it should
+	 * be possible to set work_mem and hash_mem_multiplier independently.
+	 */
+	if (hash_mem < MAX_KILOBYTES)
+		return (int) hash_mem;
+
+	return MAX_KILOBYTES;
+}
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 9bb23fef1a6e9..5532b91a71dca 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -89,9 +89,9 @@
  * PHJ_BUILD_HASHING_INNER so we can skip loading.
  *
  * Initially we try to plan for a single-batch hash join using the combined
- * work_mem of all participants to create a large shared hash table.  If that
+ * hash_mem of all participants to create a large shared hash table.  If that
  * turns out either at planning or execution time to be impossible then we
- * fall back to regular work_mem sized hash tables.
+ * fall back to regular hash_mem sized hash tables.
  *
  * To avoid deadlocks, we never wait for any barrier unless it is known that
  * all other backends attached to it are actively executing the node or have
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 27ce4cc8069b2..fda4b2c6e875f 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -3525,7 +3525,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	 * Get hash table size that executor would use for inner relation.
 	 *
 	 * XXX for the moment, always assume that skew optimization will be
-	 * performed.  As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
+	 * performed.  As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
 	 * trying to determine that for sure.
 	 *
 	 * XXX at some point it might be interesting to try to account for skew
@@ -3534,7 +3534,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	ExecChooseHashTableSize(inner_path_rows_total,
 							inner_path->pathtarget->width,
 							true,	/* useskew */
-							parallel_hash,	/* try_combined_work_mem */
+							parallel_hash,	/* try_combined_hash_mem */
 							outer_path->parallel_workers,
 							&space_allowed,
 							&numbuckets,
@@ -3597,6 +3597,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	Cost		run_cost = workspace->run_cost;
 	int			numbuckets = workspace->numbuckets;
 	int			numbatches = workspace->numbatches;
+	int			hash_mem;
 	Cost		cpu_per_tuple;
 	QualCost	hash_qual_cost;
 	QualCost	qp_qual_cost;
@@ -3715,16 +3716,17 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	}
 
 	/*
-	 * If the bucket holding the inner MCV would exceed work_mem, we don't
+	 * If the bucket holding the inner MCV would exceed hash_mem, we don't
 	 * want to hash unless there is really no other alternative, so apply
 	 * disable_cost.  (The executor normally copes with excessive memory usage
 	 * by splitting batches, but obviously it cannot separate equal values
-	 * that way, so it will be unable to drive the batch size below work_mem
+	 * that way, so it will be unable to drive the batch size below hash_mem
 	 * when this is true.)
 	 */
+	hash_mem = get_hash_mem();
 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
 						   inner_path->pathtarget->width) >
-		(work_mem * 1024L))
+		(hash_mem * 1024L))
 		startup_cost += disable_cost;
 
 	/*
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1345e522dcf5e..b40a112c25b23 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4196,16 +4196,17 @@ consider_groupingsets_paths(PlannerInfo *root,
 							double dNumGroups)
 {
 	Query	   *parse = root->parse;
+	int			hash_mem = get_hash_mem();
 
 	/*
 	 * If we're not being offered sorted input, then only consider plans that
 	 * can be done entirely by hashing.
 	 *
-	 * We can hash everything if it looks like it'll fit in work_mem. But if
+	 * We can hash everything if it looks like it'll fit in hash_mem. But if
 	 * the input is actually sorted despite not being advertised as such, we
 	 * prefer to make use of that in order to use less memory.
 	 *
-	 * If none of the grouping sets are sortable, then ignore the work_mem
+	 * If none of the grouping sets are sortable, then ignore the hash_mem
 	 * limit and generate a path anyway, since otherwise we'll just fail.
 	 */
 	if (!is_sorted)
@@ -4257,10 +4258,10 @@ consider_groupingsets_paths(PlannerInfo *root,
 
 		/*
 		 * gd->rollups is empty if we have only unsortable columns to work
-		 * with.  Override work_mem in that case; otherwise, we'll rely on the
+		 * with.  Override hash_mem in that case; otherwise, we'll rely on the
 		 * sorted-input case to generate usable mixed paths.
 		 */
-		if (hashsize > work_mem * 1024L && gd->rollups)
+		if (hashsize > hash_mem * 1024L && gd->rollups)
 			return;				/* nope, won't fit */
 
 		/*
@@ -4379,7 +4380,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 	{
 		List	   *rollups = NIL;
 		List	   *hash_sets = list_copy(gd->unsortable_sets);
-		double		availspace = (work_mem * 1024.0);
+		double		availspace = (hash_mem * 1024.0);
 		ListCell   *lc;
 
 		/*
@@ -4400,7 +4401,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 
 			/*
 			 * We treat this as a knapsack problem: the knapsack capacity
-			 * represents work_mem, the item weights are the estimated memory
+			 * represents hash_mem, the item weights are the estimated memory
 			 * usage of the hashtables needed to implement a single rollup,
 			 * and we really ought to use the cost saving as the item value;
 			 * however, currently the costs assigned to sort nodes don't
@@ -4441,7 +4442,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 																rollup->numGroups);
 
 					/*
-					 * If sz is enormous, but work_mem (and hence scale) is
+					 * If sz is enormous, but hash_mem (and hence scale) is
 					 * small, avoid integer overflow here.
 					 */
 					k_weights[i] = (int) Min(floor(sz / scale),
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index b02fcb9bfe7ac..9a8f738c9d05b 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -200,7 +200,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 	 * XXX If an ANY subplan is uncorrelated, build_subplan may decide to hash
 	 * its output.  In that case it would've been better to specify full
 	 * retrieval.  At present, however, we can only check hashability after
-	 * we've made the subplan :-(.  (Determining whether it'll fit in work_mem
+	 * we've made the subplan :-(.  (Determining whether it'll fit in hash_mem
 	 * is the really hard part.)  Therefore, we don't want to be too
 	 * optimistic about the percentage of tuples retrieved, for fear of
 	 * selecting a plan that's bad for the materialization case.
@@ -278,7 +278,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 
 			plan = create_plan(subroot, best_path);
 
-			/* Now we can check if it'll fit in work_mem */
+			/* Now we can check if it'll fit in hash_mem */
 			/* XXX can we check this at the Path stage? */
 			if (subplan_is_hashable(plan))
 			{
@@ -716,16 +716,17 @@ static bool
 subplan_is_hashable(Plan *plan)
 {
 	double		subquery_size;
+	int			hash_mem = get_hash_mem();
 
 	/*
-	 * The estimated size of the subquery result must fit in work_mem. (Note:
+	 * The estimated size of the subquery result must fit in hash_mem. (Note:
 	 * we use heap tuple overhead here even though the tuples will actually be
 	 * stored as MinimalTuples; this provides some fudge factor for hashtable
 	 * overhead.)
 	 */
 	subquery_size = plan->plan_rows *
 		(MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader));
-	if (subquery_size > work_mem * 1024L)
+	if (subquery_size > hash_mem * 1024L)
 		return false;
 
 	return true;
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 6588f83d5ec6f..2ebd4ea332071 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1018,6 +1018,7 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 					const char *construct)
 {
 	int			numGroupCols = list_length(groupClauses);
+	int			hash_mem = get_hash_mem();
 	bool		can_sort;
 	bool		can_hash;
 	Size		hashentrysize;
@@ -1049,15 +1050,17 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 
 	/*
 	 * Don't do it if it doesn't look like the hashtable will fit into
-	 * work_mem.
+	 * hash_mem.
 	 */
 	hashentrysize = MAXALIGN(input_path->pathtarget->width) + MAXALIGN(SizeofMinimalTupleHeader);
 
-	if (hashentrysize * dNumGroups > work_mem * 1024L)
+	if (hashentrysize * dNumGroups > hash_mem * 1024L)
 		return false;
 
 	/*
-	 * See if the estimated cost is no more than doing it the other way.
+	 * See if the estimated cost is no more than doing it the other way.  We
+	 * deliberately give the hash case more memory when hash_mem exceeds
+	 * standard work mem (i.e. when hash_mem_multiplier exceeds 1.0).
 	 *
 	 * We need to consider input_plan + hashagg versus input_plan + sort +
 	 * group.  Note that the actual result plan might involve a SetOp or
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5110a6b806017..c1fc866cbf911 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1720,8 +1720,9 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 		 * planner.c).
 		 */
 		int			hashentrysize = subpath->pathtarget->width + 64;
+		int			hash_mem = get_hash_mem();
 
-		if (hashentrysize * pathnode->path.rows > work_mem * 1024L)
+		if (hashentrysize * pathnode->path.rows > hash_mem * 1024L)
 		{
 			/*
 			 * We should not try to hash.  Hack the SpecialJoinInfo to
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c
index bb49e80d1665b..06cf16d9d7164 100644
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -1450,7 +1450,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	 * enough to not use a multiple of work_mem, and one typically would not
 	 * have many large foreign-key validations happening concurrently.  So
 	 * this seems to meet the criteria for being considered a "maintenance"
-	 * operation, and accordingly we use maintenance_work_mem.
+	 * operation, and accordingly we use maintenance_work_mem.  However, we
+	 * must also set hash_mem_multiplier to 1, since it is surely not okay to
+	 * let that get applied to the maintenance_work_mem value.
 	 *
 	 * We use the equivalent of a function SET option to allow the setting to
 	 * persist for exactly the duration of the check query.  guc.c also takes
@@ -1462,6 +1464,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	(void) set_config_option("work_mem", workmembuf,
 							 PGC_USERSET, PGC_S_SESSION,
 							 GUC_ACTION_SAVE, true, 0, false);
+	(void) set_config_option("hash_mem_multiplier", "1",
+							 PGC_USERSET, PGC_S_SESSION,
+							 GUC_ACTION_SAVE, true, 0, false);
 
 	if (SPI_connect() != SPI_OK_CONNECT)
 		elog(ERROR, "SPI_connect failed");
@@ -1553,7 +1558,7 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 		elog(ERROR, "SPI_finish failed");
 
 	/*
-	 * Restore work_mem.
+	 * Restore work_mem and hash_mem_multiplier.
 	 */
 	AtEOXact_GUC(true, save_nestlevel);
 
@@ -1685,7 +1690,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	 * enough to not use a multiple of work_mem, and one typically would not
 	 * have many large foreign-key validations happening concurrently.  So
 	 * this seems to meet the criteria for being considered a "maintenance"
-	 * operation, and accordingly we use maintenance_work_mem.
+	 * operation, and accordingly we use maintenance_work_mem.  However, we
+	 * must also set hash_mem_multiplier to 1, since it is surely not okay to
+	 * let that get applied to the maintenance_work_mem value.
 	 *
 	 * We use the equivalent of a function SET option to allow the setting to
 	 * persist for exactly the duration of the check query.  guc.c also takes
@@ -1697,6 +1704,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	(void) set_config_option("work_mem", workmembuf,
 							 PGC_USERSET, PGC_S_SESSION,
 							 GUC_ACTION_SAVE, true, 0, false);
+	(void) set_config_option("hash_mem_multiplier", "1",
+							 PGC_USERSET, PGC_S_SESSION,
+							 GUC_ACTION_SAVE, true, 0, false);
 
 	if (SPI_connect() != SPI_OK_CONNECT)
 		elog(ERROR, "SPI_connect failed");
@@ -1763,7 +1773,7 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 		elog(ERROR, "SPI_finish failed");
 
 	/*
-	 * Restore work_mem.
+	 * Restore work_mem and hash_mem_multiplier.
 	 */
 	AtEOXact_GUC(true, save_nestlevel);
 }
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 497d7c38ae6fd..6ab8216839891 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -119,6 +119,7 @@ int			IntervalStyle = INTSTYLE_POSTGRES;
 bool		enableFsync = true;
 bool		allowSystemTableMods = false;
 int			work_mem = 4096;
+double		hash_mem_multiplier = 1.0;
 int			maintenance_work_mem = 65536;
 int			max_parallel_maintenance_workers = 2;
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index abfa95a2314bf..c20885e97b203 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3542,6 +3542,17 @@ static struct config_real ConfigureNamesReal[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"hash_mem_multiplier", PGC_USERSET, RESOURCES_MEM,
+			gettext_noop("Multiple of work_mem to use for hash tables."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&hash_mem_multiplier,
+		1.0, 1.0, 1000.0,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"bgwriter_lru_multiplier", PGC_SIGHUP, RESOURCES_BGWRITER,
 			gettext_noop("Multiple of the average buffer usage to free per round."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 5a0b8e982179e..aa30291ea3964 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -130,6 +130,7 @@
 # Caution: it is not advisable to set max_prepared_transactions nonzero unless
 # you actively intend to use prepared transactions.
 #work_mem = 4MB				# min 64kB
+#hash_mem_multiplier = 1.0		# 1-1000.0 multiplier on hash table work_mem
 #maintenance_work_mem = 64MB		# min 1MB
 #autovacuum_work_mem = -1		# min 1MB, or -1 to use maintenance_work_mem
 #logical_decoding_work_mem = 64MB	# min 64kB
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 79b634e8ed10e..eb5daba36b0ff 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -88,7 +88,7 @@ typedef struct HashJoinTupleData
  * outer relation tuples with these hash values are matched against that
  * table instead of the main one.  Thus, tuples with these hash values are
  * effectively handled as part of the first batch and will never go to disk.
- * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
+ * The skew hashtable is limited to SKEW_HASH_MEM_PERCENT of the total memory
  * allowed for the join; while building the hashtables, we decrease the number
  * of MCVs being specially treated if needed to stay under this limit.
  *
@@ -107,7 +107,7 @@ typedef struct HashSkewBucket
 
 #define SKEW_BUCKET_OVERHEAD  MAXALIGN(sizeof(HashSkewBucket))
 #define INVALID_SKEW_BUCKET_NO	(-1)
-#define SKEW_WORK_MEM_PERCENT  2
+#define SKEW_HASH_MEM_PERCENT  2
 #define SKEW_MIN_OUTER_FRACTION  0.01
 
 /*
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 64d2ce693ca7d..2db4e2f67267b 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -61,7 +61,7 @@ extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
 extern void ExecHashTableReset(HashJoinTable hashtable);
 extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-									bool try_combined_work_mem,
+									bool try_combined_hash_mem,
 									int parallel_workers,
 									size_t *space_allowed,
 									int *numbuckets,
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 18bc8a7b9045c..72e33523984f4 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -243,6 +243,7 @@ extern PGDLLIMPORT int IntervalStyle;
 extern bool enableFsync;
 extern PGDLLIMPORT bool allowSystemTableMods;
 extern PGDLLIMPORT int work_mem;
+extern PGDLLIMPORT double hash_mem_multiplier;
 extern PGDLLIMPORT int maintenance_work_mem;
 extern PGDLLIMPORT int max_parallel_maintenance_workers;
 
@@ -469,4 +470,7 @@ extern bool has_rolreplication(Oid roleid);
 extern bool BackupInProgress(void);
 extern void CancelBackup(void);
 
+/* in executor/nodeHash.c */
+extern int	get_hash_mem(void);
+
 #endif							/* MISCADMIN_H */

From 3347c982bab0dd56d5b6cb784521233ba2bbac27 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Thu, 30 Jul 2020 17:08:11 +1200
Subject: [PATCH 34/54] Use a long lived WaitEventSet for WaitLatch().

Create LatchWaitSet at backend startup time, and use it to implement
WaitLatch().  This avoids repeated epoll/kqueue setup and teardown
system calls.

Reorder SubPostmasterMain() slightly so that we restore the postmaster
pipe and Windows signal emulation before we reach InitPostmasterChild(),
to make this work in EXEC_BACKEND builds.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/postmaster.c | 24 ++++++-------
 src/backend/storage/ipc/latch.c     | 56 ++++++++++++++++++++++++++---
 src/backend/utils/init/miscinit.c   |  2 ++
 src/include/storage/latch.h         |  1 +
 4 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 1db6a3d29d01e..5b5fc97c72dae 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -4896,9 +4896,6 @@ SubPostmasterMain(int argc, char *argv[])
 	IsPostmasterEnvironment = true;
 	whereToSendOutput = DestNone;
 
-	/* Setup as postmaster child */
-	InitPostmasterChild();
-
 	/* Setup essential subsystems (to ensure elog() behaves sanely) */
 	InitializeGUCOptions();
 
@@ -4913,6 +4910,18 @@ SubPostmasterMain(int argc, char *argv[])
 	/* Close the postmaster's sockets (as soon as we know them) */
 	ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0);
 
+	/*
+	 * Start our win32 signal implementation. This has to be done after we
+	 * read the backend variables, because we need to pick up the signal pipe
+	 * from the parent process.
+	 */
+#ifdef WIN32
+	pgwin32_signal_initialize();
+#endif
+
+	/* Setup as postmaster child */
+	InitPostmasterChild();
+
 	/*
 	 * Set up memory area for GSS information. Mirrors the code in ConnCreate
 	 * for the non-exec case.
@@ -4956,15 +4965,6 @@ SubPostmasterMain(int argc, char *argv[])
 	if (strcmp(argv[1], "--forkavworker") == 0)
 		AutovacuumWorkerIAm();
 
-	/*
-	 * Start our win32 signal implementation. This has to be done after we
-	 * read the backend variables, because we need to pick up the signal pipe
-	 * from the parent process.
-	 */
-#ifdef WIN32
-	pgwin32_signal_initialize();
-#endif
-
 	/* In EXEC_BACKEND case we will not have inherited these settings */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 91fa4b619b8c5..4153cc85579f7 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -56,6 +56,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -129,6 +130,12 @@ struct WaitEventSet
 #endif
 };
 
+/* A common WaitEventSet used to implement WatchLatch() */
+static WaitEventSet *LatchWaitSet;
+
+/* The position of the latch in LatchWaitSet. */
+#define LatchWaitSetLatchPos 0
+
 #ifndef WIN32
 /* Are we currently in WaitLatch? The signal handler would like to know. */
 static volatile sig_atomic_t waiting = false;
@@ -242,6 +249,24 @@ InitializeLatchSupport(void)
 #endif
 }
 
+void
+InitializeLatchWaitSet(void)
+{
+	int			latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
+	Assert(LatchWaitSet == NULL);
+
+	/* Set up the WaitEventSet used by WaitLatch(). */
+	LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+	latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	if (IsUnderPostmaster)
+		AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
+						  PGINVALID_SOCKET, NULL, NULL);
+
+	Assert(latch_pos == LatchWaitSetLatchPos);
+}
+
 /*
  * Initialize a process-local latch.
  */
@@ -365,8 +390,31 @@ int
 WaitLatch(Latch *latch, int wakeEvents, long timeout,
 		  uint32 wait_event_info)
 {
-	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
-							 wait_event_info);
+	WaitEvent	event;
+
+	/* Postmaster-managed callers must handle postmaster death somehow. */
+	Assert(!IsUnderPostmaster ||
+		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+		   (wakeEvents & WL_POSTMASTER_DEATH));
+
+	/*
+	 * Some callers may have a latch other than MyLatch, or no latch at all,
+	 * or want to handle postmaster death differently.  It's cheap to assign
+	 * those, so just do it every time.
+	 */
+	if (!(wakeEvents & WL_LATCH_SET))
+		latch = NULL;
+	ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
+	LatchWaitSet->exit_on_postmaster_death =
+		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+	if (WaitEventSetWait(LatchWaitSet,
+						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+						 &event, 1,
+						 wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
 }
 
 /*
@@ -830,7 +878,8 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 
 /*
  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
- * with the WaitEvent.
+ * with the WaitEvent.  The latch may be changed to NULL to disable the latch
+ * temporarily, and then set back to a latch later.
  *
  * 'pos' is the id returned by AddWaitEventToSet.
  */
@@ -862,7 +911,6 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 	if (event->events & WL_LATCH_SET &&
 		events != event->events)
 	{
-		/* we could allow to disable latch events for a while */
 		elog(ERROR, "cannot modify latch event");
 	}
 
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index cca9704d2d7af..cf8f9579c345f 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -120,6 +120,7 @@ InitPostmasterChild(void)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeLatchWaitSet();
 
 	/*
 	 * If possible, make this process a group leader, so that the postmaster
@@ -152,6 +153,7 @@ InitStandaloneProcess(const char *argv0)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeLatchWaitSet();
 
 	/* Compute paths, no postmaster to inherit from */
 	if (my_exec_path[0] == '\0')
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3f1e..7c742021fb1b1 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern void InitializeLatchWaitSet(void);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.

From e2d394df5df28ab5ee4bfac6b13837e99e24045d Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Thu, 30 Jul 2020 17:23:32 +1200
Subject: [PATCH 35/54] Use WaitLatch() for condition variables.

Previously, condition_variable.c created a long lived WaitEventSet to
avoid extra system calls.  WaitLatch() now uses something similar
internally, so there is no point in wasting an extra kernel descriptor.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/lmgr/condition_variable.c | 28 ++++---------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 37b6a4eecdb68..2ec00397b491b 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -135,6 +115,7 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 	long		cur_timeout = -1;
 	instr_time	start_time;
 	instr_time	cur_time;
+	int			wait_events;
 
 	/*
 	 * If the caller didn't prepare to sleep explicitly, then do so now and
@@ -166,19 +147,20 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 		INSTR_TIME_SET_CURRENT(start_time);
 		Assert(timeout >= 0 && timeout <= INT_MAX);
 		cur_timeout = timeout;
+		wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
 	}
+	else
+		wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);

From e7591fd3cae6c64236ef29d3c87e69b96608a19b Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Thu, 30 Jul 2020 17:25:48 +1200
Subject: [PATCH 36/54] Introduce a WaitEventSet for the stats collector.

This avoids avoids some epoll/kqueue system calls for every wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 88992c2da2c84..15f92b66c6ba4 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4458,6 +4458,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4485,6 +4487,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4672,10 +4680,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4688,18 +4693,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */ , &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4708,6 +4710,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 

From 903134fcc0ccd188803fdbc2b7c06b898749153a Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Thu, 30 Jul 2020 15:48:44 +0900
Subject: [PATCH 37/54] doc: Mention index references in pg_inherits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Partitioned indexes are also registered in pg_inherits, but the
description of this catalog did not reflect that.

Author: Dagfinn Ilmari Mannsåker
Discussion: https://postgr.es/m/87k0ynj35y.fsf@wibble.ilmari.org
Backpatch-through: 11
---
 doc/src/sgml/catalogs.sgml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index a99c681887b5c..26fda20d19394 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -4417,9 +4417,9 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
 
   <para>
    The catalog <structname>pg_inherits</structname> records information about
-   table inheritance hierarchies.  There is one entry for each direct
-   parent-child table relationship in the database.  (Indirect inheritance can be determined
-   by following chains of entries.)
+   table and index inheritance hierarchies.  There is one entry for each direct
+   parent-child table or index relationship in the database.  (Indirect
+   inheritance can be determined by following chains of entries.)
   </para>
 
   <table>
@@ -4443,7 +4443,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>oid</structfield>)
       </para>
       <para>
-       The OID of the child table
+       The OID of the child table or index
       </para></entry>
      </row>
 
@@ -4453,7 +4453,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>oid</structfield>)
       </para>
       <para>
-       The OID of the parent table
+       The OID of the parent table or index
       </para></entry>
      </row>
 
@@ -4465,6 +4465,10 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        If there is more than one direct parent for a child table (multiple
        inheritance), this number tells the order in which the
        inherited columns are to be arranged.  The count starts at 1.
+      </para>
+      <para>
+       Indexes can not have multiple inheritance, since they can only inherit
+       when using declarative partitioning.
       </para></entry>
      </row>
     </tbody>

From f1af75c5f2516ec5b20cfe4b3a474071a318ae1e Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Thu, 30 Jul 2020 16:57:37 +0900
Subject: [PATCH 38/54] Include partitioned tables for tab completion of VACUUM
 in psql

The relkinds that support indexing are the same as the ones supporting
VACUUM, so the code gets refactored a bit with the completion query used
for CLUSTER, but there is no change for CLUSTER in this commit.

Author: Justin Pryzby
Reviewed-by: Fujii Masao, Michael Paquier, Masahiko Sawada
Discussion: https://postgr.es/m/20200728170408.GI20393@telsasoft.com
---
 src/bin/psql/tab-complete.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 8b735476ade43..c4af40bfa9fa7 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -330,6 +330,9 @@ do { \
 
 /*
  * Assembly instructions for schema queries
+ *
+ * Note that toast tables are not included in those queries to avoid
+ * unnecessary bloat in the completions generated.
  */
 
 static const SchemaQuery Query_for_list_of_aggregates[] = {
@@ -573,8 +576,14 @@ static const SchemaQuery Query_for_list_of_indexables = {
 	.result = "pg_catalog.quote_ident(c.relname)",
 };
 
-/* Relations supporting VACUUM */
-static const SchemaQuery Query_for_list_of_vacuumables = {
+/*
+ * Relations supporting VACUUM are currently same as those supporting
+ * indexing.
+ */
+#define Query_for_list_of_vacuumables Query_for_list_of_indexables
+
+/* Relations supporting CLUSTER */
+static const SchemaQuery Query_for_list_of_clusterables = {
 	.catname = "pg_catalog.pg_class c",
 	.selcondition =
 	"c.relkind IN (" CppAsString2(RELKIND_RELATION) ", "
@@ -584,9 +593,6 @@ static const SchemaQuery Query_for_list_of_vacuumables = {
 	.result = "pg_catalog.quote_ident(c.relname)",
 };
 
-/* Relations supporting CLUSTER are currently same as those supporting VACUUM */
-#define Query_for_list_of_clusterables Query_for_list_of_vacuumables
-
 static const SchemaQuery Query_for_list_of_constraints_with_schema = {
 	.catname = "pg_catalog.pg_constraint c",
 	.selcondition = "c.conrelid <> 0",

From fd734f387d8780d9989d750942d026167de8cf3c Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Thu, 30 Jul 2020 08:44:58 -0700
Subject: [PATCH 39/54] Use pg_bitutils for HyperLogLog.

Using pg_leftmost_one_post32() yields substantial performance benefits.

Backpatching to version 13 because HLL is used for HashAgg
improvements in 9878b643, which was also backpatched to 13.

Reviewed-by: Peter Geoghegan
Discussion: https://postgr.es/m/CAH2-WzkGvDKVDo+0YvfvZ+1CE=iCi88DCOGFF3i1hTGGaxcKPw@mail.gmail.com
Backpatch-through: 13
---
 src/backend/lib/hyperloglog.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
index a5cc1f8b83b4e..351fed8186fb2 100644
--- a/src/backend/lib/hyperloglog.c
+++ b/src/backend/lib/hyperloglog.c
@@ -49,6 +49,7 @@
 #include <math.h>
 
 #include "lib/hyperloglog.h"
+#include "port/pg_bitutils.h"
 
 #define POW_2_32			(4294967296.0)
 #define NEG_POW_2_32		(-4294967296.0)
@@ -242,11 +243,13 @@ rho(uint32 x, uint8 b)
 {
 	uint8		j = 1;
 
-	while (j <= b && !(x & 0x80000000))
-	{
-		j++;
-		x <<= 1;
-	}
+	if (x == 0)
+		return b + 1;
+
+	j = 32 - pg_leftmost_one_pos32(x);
+
+	if (j > b)
+		return b + 1;
 
 	return j;
 }

From cab2556f3ab289b81a9c6a75e669b6ec78356ffc Mon Sep 17 00:00:00 2001
From: Tatsuo Ishii <ishii@postgresql.org>
Date: Fri, 31 Jul 2020 07:18:41 +0900
Subject: [PATCH 40/54] Doc: fix high availability solutions comparison.

In "High Availability, Load Balancing, and Replication" chapter,
certain descriptions of Pgpool-II were not correct at this point.  It
does not need conflict resolution. Also "Multiple-Server Parallel
Query Execution" is not supported anymore.

Discussion: https://postgr.es/m/20200726.230128.53842489850344110.t-ishii%40sraoss.co.jp
Author: Tatsuo Ishii
Reviewed-by: Bruce Momjian
Backpatch-through: 9.5
---
 doc/src/sgml/high-availability.sgml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml
index 89f6d6eda6362..a824d383f2d89 100644
--- a/doc/src/sgml/high-availability.sgml
+++ b/doc/src/sgml/high-availability.sgml
@@ -429,7 +429,7 @@ protocol to make nodes agree on a serializable transactional order.
      <entry align="center">&bull;</entry>
      <entry align="center"></entry>
      <entry align="center">&bull;</entry>
-     <entry align="center"></entry>
+     <entry align="center">&bull;</entry>
      <entry align="center"></entry>
      <entry align="center">&bull;</entry>
     </row>
@@ -471,8 +471,7 @@ protocol to make nodes agree on a serializable transactional order.
      concurrently on a single query.  It is usually accomplished by
      splitting the data among servers and having each server execute its
      part of the query and return results to a central server where they
-     are combined and returned to the user.  <productname>Pgpool-II</productname>
-     has this capability.  Also, this can be implemented using the
+     are combined and returned to the user. This can be implemented using the
      <productname>PL/Proxy</productname> tool set.
     </para>
 

From e3931d01f3afef14703827eda1dad0a3fb3b5d07 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 31 Jul 2020 10:54:26 +0900
Subject: [PATCH 41/54] Use multi-inserts for pg_attribute and pg_shdepend

For pg_attribute, this allows to insert at once a full set of attributes
for a relation (roughly 15% of WAL reduction in extreme cases).  For
pg_shdepend, this reduces the work done when creating new shared
dependencies from a database template.  The number of slots used for the
insertion is capped at 64kB of data inserted for both, depending on the
number of items to insert and the length of the rows involved.

More can be done for other catalogs, like pg_depend.  This part requires
a different approach as the number of slots to use depends also on the
number of entries discarded as pinned dependencies.  This is also
related to the rework or dependency handling for ALTER TABLE and CREATE
TABLE, mainly.

Author: Daniel Gustafsson
Reviewed-by: Andres Freund, Michael Paquier
Discussion: https://postgr.es/m/20190213182737.mxn6hkdxwrzgxk35@alap3.anarazel.de
---
 src/backend/access/heap/heapam.c  |   8 +-
 src/backend/catalog/heap.c        | 199 ++++++++++++++++++------------
 src/backend/catalog/index.c       |  19 +--
 src/backend/catalog/indexing.c    |  36 ++++++
 src/backend/catalog/pg_shdepend.c |  60 ++++++---
 src/backend/commands/tablecmds.c  |   8 +-
 src/include/catalog/heap.h        |   9 +-
 src/include/catalog/indexing.h    |   5 +
 8 files changed, 225 insertions(+), 119 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 8df2716de46cc..5eef225f5c791 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2164,8 +2164,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 		RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
 
 		/*
-		 * Note that heap_multi_insert is not used for catalog tuples yet, but
-		 * this will cover the gap once that is the case.
+		 * For logical decoding we need combocids to properly decode the
+		 * catalog.
 		 */
 		if (needwal && need_cids)
 			log_heap_new_cid(relation, heaptuples[ndone]);
@@ -2180,8 +2180,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 			RelationPutHeapTuple(relation, buffer, heaptup, false);
 
 			/*
-			 * We don't use heap_multi_insert for catalog tuples yet, but
-			 * better be prepared...
+			 * For logical decoding we need combocids to properly decode the
+			 * catalog.
 			 */
 			if (needwal && need_cids)
 				log_heap_new_cid(relation, heaptup);
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 3985326df62f7..f2ca686397ebd 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -710,70 +710,122 @@ CheckAttributeType(const char *attname,
 }
 
 /*
- * InsertPgAttributeTuple
- *		Construct and insert a new tuple in pg_attribute.
+ * Cap the maximum amount of bytes allocated for InsertPgAttributeTuples()
+ * slots.
+ */
+#define MAX_PGATTRIBUTE_INSERT_BYTES 65535
+
+/*
+ * InsertPgAttributeTuples
+ *		Construct and insert a set of tuples in pg_attribute.
  *
- * Caller has already opened and locked pg_attribute.  new_attribute is the
- * attribute to insert.  attcacheoff is always initialized to -1, attacl,
- * attfdwoptions and attmissingval are always initialized to NULL.
+ * Caller has already opened and locked pg_attribute.  tupdesc contains the
+ * attributes to insert.  attcacheoff is always initialized to -1, attacl,
+ * attfdwoptions and attmissingval are always initialized to NULL.  attoptions
+ * must contain the same number of elements as tupdesc, or be NULL.
  *
  * indstate is the index state for CatalogTupleInsertWithInfo.  It can be
  * passed as NULL, in which case we'll fetch the necessary info.  (Don't do
  * this when inserting multiple attributes, because it's a tad more
  * expensive.)
+ *
+ * new_rel_oid is the relation OID assigned to the attributes inserted.
+ * If set to InvalidOid, the relation OID from tupdesc is used instead.
  */
 void
-InsertPgAttributeTuple(Relation pg_attribute_rel,
-					   Form_pg_attribute new_attribute,
-					   Datum attoptions,
-					   CatalogIndexState indstate)
+InsertPgAttributeTuples(Relation pg_attribute_rel,
+						TupleDesc tupdesc,
+						Oid new_rel_oid,
+						Datum *attoptions,
+						CatalogIndexState indstate)
 {
-	Datum		values[Natts_pg_attribute];
-	bool		nulls[Natts_pg_attribute];
-	HeapTuple	tup;
+	TupleTableSlot **slot;
+	TupleDesc	td;
+	int			nslots;
+	int			natts = 0;
+	int			slotCount = 0;
+	bool		close_index = false;
+
+	td = RelationGetDescr(pg_attribute_rel);
+
+	/* Initialize the number of slots to use */
+	nslots = Min(tupdesc->natts,
+				 (MAX_PGATTRIBUTE_INSERT_BYTES / sizeof(FormData_pg_attribute)));
+	slot = palloc(sizeof(TupleTableSlot *) * nslots);
+	for (int i = 0; i < nslots; i++)
+		slot[i] = MakeSingleTupleTableSlot(td, &TTSOpsHeapTuple);
+
+	while (natts < tupdesc->natts)
+	{
+		Form_pg_attribute attrs = TupleDescAttr(tupdesc, natts);
 
-	/* This is a tad tedious, but way cleaner than what we used to do... */
-	memset(values, 0, sizeof(values));
-	memset(nulls, false, sizeof(nulls));
+		ExecClearTuple(slot[slotCount]);
 
-	values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(new_attribute->attrelid);
-	values[Anum_pg_attribute_attname - 1] = NameGetDatum(&new_attribute->attname);
-	values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(new_attribute->atttypid);
-	values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(new_attribute->attstattarget);
-	values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(new_attribute->attlen);
-	values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(new_attribute->attnum);
-	values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(new_attribute->attndims);
-	values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1);
-	values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(new_attribute->atttypmod);
-	values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(new_attribute->attbyval);
-	values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(new_attribute->attstorage);
-	values[Anum_pg_attribute_attalign - 1] = CharGetDatum(new_attribute->attalign);
-	values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(new_attribute->attnotnull);
-	values[Anum_pg_attribute_atthasdef - 1] = BoolGetDatum(new_attribute->atthasdef);
-	values[Anum_pg_attribute_atthasmissing - 1] = BoolGetDatum(new_attribute->atthasmissing);
-	values[Anum_pg_attribute_attidentity - 1] = CharGetDatum(new_attribute->attidentity);
-	values[Anum_pg_attribute_attgenerated - 1] = CharGetDatum(new_attribute->attgenerated);
-	values[Anum_pg_attribute_attisdropped - 1] = BoolGetDatum(new_attribute->attisdropped);
-	values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(new_attribute->attislocal);
-	values[Anum_pg_attribute_attinhcount - 1] = Int32GetDatum(new_attribute->attinhcount);
-	values[Anum_pg_attribute_attcollation - 1] = ObjectIdGetDatum(new_attribute->attcollation);
-	values[Anum_pg_attribute_attoptions - 1] = attoptions;
-
-	/* start out with empty permissions and empty options */
-	nulls[Anum_pg_attribute_attacl - 1] = true;
-	nulls[Anum_pg_attribute_attoptions - 1] = attoptions == (Datum) 0;
-	nulls[Anum_pg_attribute_attfdwoptions - 1] = true;
-	nulls[Anum_pg_attribute_attmissingval - 1] = true;
-
-	tup = heap_form_tuple(RelationGetDescr(pg_attribute_rel), values, nulls);
+		if (new_rel_oid != InvalidOid)
+			slot[slotCount]->tts_values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(new_rel_oid);
+		else
+			slot[slotCount]->tts_values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(attrs->attrelid);
+
+		slot[slotCount]->tts_values[Anum_pg_attribute_attname - 1] = NameGetDatum(&attrs->attname);
+		slot[slotCount]->tts_values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(attrs->atttypid);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(attrs->attstattarget);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(attrs->attlen);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(attrs->attnum);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(attrs->attndims);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1);
+		slot[slotCount]->tts_values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(attrs->atttypmod);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(attrs->attbyval);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(attrs->attstorage);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attalign - 1] = CharGetDatum(attrs->attalign);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(attrs->attnotnull);
+		slot[slotCount]->tts_values[Anum_pg_attribute_atthasdef - 1] = BoolGetDatum(attrs->atthasdef);
+		slot[slotCount]->tts_values[Anum_pg_attribute_atthasmissing - 1] = BoolGetDatum(attrs->atthasmissing);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attidentity - 1] = CharGetDatum(attrs->attidentity);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attgenerated - 1] = CharGetDatum(attrs->attgenerated);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attisdropped - 1] = BoolGetDatum(attrs->attisdropped);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(attrs->attislocal);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attinhcount - 1] = Int32GetDatum(attrs->attinhcount);
+		slot[slotCount]->tts_values[Anum_pg_attribute_attcollation - 1] = ObjectIdGetDatum(attrs->attcollation);
+		if (attoptions && attoptions[natts] != (Datum) 0)
+			slot[slotCount]->tts_values[Anum_pg_attribute_attoptions - 1] = attoptions[natts];
+		else
+			slot[slotCount]->tts_isnull[Anum_pg_attribute_attoptions - 1] = true;
 
-	/* finally insert the new tuple, update the indexes, and clean up */
-	if (indstate != NULL)
-		CatalogTupleInsertWithInfo(pg_attribute_rel, tup, indstate);
-	else
-		CatalogTupleInsert(pg_attribute_rel, tup);
+		/* start out with empty permissions and empty options */
+		slot[slotCount]->tts_isnull[Anum_pg_attribute_attacl - 1] = true;
+		slot[slotCount]->tts_isnull[Anum_pg_attribute_attfdwoptions - 1] = true;
+		slot[slotCount]->tts_isnull[Anum_pg_attribute_attmissingval - 1] = true;
 
-	heap_freetuple(tup);
+		ExecStoreVirtualTuple(slot[slotCount]);
+		slotCount++;
+
+		/*
+		 * If slots are full or the end of processing has been reached, insert
+		 * a batch of tuples.
+		 */
+		if (slotCount == nslots || natts == tupdesc->natts - 1)
+		{
+			/* fetch index info only when we know we need it */
+			if (!indstate)
+			{
+				indstate = CatalogOpenIndexes(pg_attribute_rel);
+				close_index = true;
+			}
+
+			/* insert the new tuples and update the indexes */
+			CatalogTuplesMultiInsertWithInfo(pg_attribute_rel, slot, slotCount,
+											 indstate);
+			slotCount = 0;
+		}
+
+		natts++;
+	}
+
+	if (close_index)
+		CatalogCloseIndexes(indstate);
+	for (int i = 0; i < nslots; i++)
+		ExecDropSingleTupleTableSlot(slot[i]);
+	pfree(slot);
 }
 
 /* --------------------------------
@@ -788,8 +840,6 @@ AddNewAttributeTuples(Oid new_rel_oid,
 					  TupleDesc tupdesc,
 					  char relkind)
 {
-	Form_pg_attribute attr;
-	int			i;
 	Relation	rel;
 	CatalogIndexState indstate;
 	int			natts = tupdesc->natts;
@@ -803,30 +853,26 @@ AddNewAttributeTuples(Oid new_rel_oid,
 
 	indstate = CatalogOpenIndexes(rel);
 
-	/*
-	 * First we add the user attributes.  This is also a convenient place to
-	 * add dependencies on their datatypes and collations.
-	 */
-	for (i = 0; i < natts; i++)
-	{
-		attr = TupleDescAttr(tupdesc, i);
-		/* Fill in the correct relation OID */
-		attr->attrelid = new_rel_oid;
-		/* Make sure this is OK, too */
-		attr->attstattarget = -1;
-
-		InsertPgAttributeTuple(rel, attr, (Datum) 0, indstate);
+	/* set stats detail level to a sane default */
+	for (int i = 0; i < natts; i++)
+		tupdesc->attrs[i].attstattarget = -1;
+	InsertPgAttributeTuples(rel, tupdesc, new_rel_oid, NULL, indstate);
 
+	/* add dependencies on their datatypes and collations */
+	for (int i = 0; i < natts; i++)
+	{
 		/* Add dependency info */
 		ObjectAddressSubSet(myself, RelationRelationId, new_rel_oid, i + 1);
-		ObjectAddressSet(referenced, TypeRelationId, attr->atttypid);
+		ObjectAddressSet(referenced, TypeRelationId,
+						 tupdesc->attrs[i].atttypid);
 		recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
 
 		/* The default collation is pinned, so don't bother recording it */
-		if (OidIsValid(attr->attcollation) &&
-			attr->attcollation != DEFAULT_COLLATION_OID)
+		if (OidIsValid(tupdesc->attrs[i].attcollation) &&
+			tupdesc->attrs[i].attcollation != DEFAULT_COLLATION_OID)
 		{
-			ObjectAddressSet(referenced, CollationRelationId, attr->attcollation);
+			ObjectAddressSet(referenced, CollationRelationId,
+							 tupdesc->attrs[i].attcollation);
 			recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
 		}
 	}
@@ -838,17 +884,12 @@ AddNewAttributeTuples(Oid new_rel_oid,
 	 */
 	if (relkind != RELKIND_VIEW && relkind != RELKIND_COMPOSITE_TYPE)
 	{
-		for (i = 0; i < (int) lengthof(SysAtt); i++)
-		{
-			FormData_pg_attribute attStruct;
+		TupleDesc	td;
 
-			memcpy(&attStruct, SysAtt[i], sizeof(FormData_pg_attribute));
+		td = CreateTupleDesc(lengthof(SysAtt), (FormData_pg_attribute **) &SysAtt);
 
-			/* Fill in the correct relation OID in the copied tuple */
-			attStruct.attrelid = new_rel_oid;
-
-			InsertPgAttributeTuple(rel, &attStruct, (Datum) 0, indstate);
-		}
+		InsertPgAttributeTuples(rel, td, new_rel_oid, NULL, indstate);
+		FreeTupleDesc(td);
 	}
 
 	/*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 8ec2864c76a94..1be27eec52e6e 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -106,8 +106,7 @@ static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
 										  Oid *classObjectId);
 static void InitializeAttributeOids(Relation indexRelation,
 									int numatts, Oid indexoid);
-static void AppendAttributeTuples(Relation indexRelation, int numatts,
-								  Datum *attopts);
+static void AppendAttributeTuples(Relation indexRelation, Datum *attopts);
 static void UpdateIndexRelation(Oid indexoid, Oid heapoid,
 								Oid parentIndexId,
 								IndexInfo *indexInfo,
@@ -485,12 +484,11 @@ InitializeAttributeOids(Relation indexRelation,
  * ----------------------------------------------------------------
  */
 static void
-AppendAttributeTuples(Relation indexRelation, int numatts, Datum *attopts)
+AppendAttributeTuples(Relation indexRelation, Datum *attopts)
 {
 	Relation	pg_attribute;
 	CatalogIndexState indstate;
 	TupleDesc	indexTupDesc;
-	int			i;
 
 	/*
 	 * open the attribute relation and its indexes
@@ -504,15 +502,7 @@ AppendAttributeTuples(Relation indexRelation, int numatts, Datum *attopts)
 	 */
 	indexTupDesc = RelationGetDescr(indexRelation);
 
-	for (i = 0; i < numatts; i++)
-	{
-		Form_pg_attribute attr = TupleDescAttr(indexTupDesc, i);
-		Datum		attoptions = attopts ? attopts[i] : (Datum) 0;
-
-		Assert(attr->attnum == i + 1);
-
-		InsertPgAttributeTuple(pg_attribute, attr, attoptions, indstate);
-	}
+	InsertPgAttributeTuples(pg_attribute, indexTupDesc, InvalidOid, attopts, indstate);
 
 	CatalogCloseIndexes(indstate);
 
@@ -979,8 +969,7 @@ index_create(Relation heapRelation,
 	/*
 	 * append ATTRIBUTE tuples for the index
 	 */
-	AppendAttributeTuples(indexRelation, indexInfo->ii_NumIndexAttrs,
-						  indexInfo->ii_OpclassOptions);
+	AppendAttributeTuples(indexRelation, indexInfo->ii_OpclassOptions);
 
 	/* ----------------
 	 *	  update pg_index
diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c
index fe277f3ad3712..538f6a06b872d 100644
--- a/src/backend/catalog/indexing.c
+++ b/src/backend/catalog/indexing.c
@@ -18,6 +18,7 @@
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/xact.h"
 #include "catalog/index.h"
 #include "catalog/indexing.h"
 #include "executor/executor.h"
@@ -250,6 +251,41 @@ CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup,
 	CatalogIndexInsert(indstate, tup);
 }
 
+/*
+ * CatalogTuplesMultiInsertWithInfo - as above, but for multiple tuples
+ *
+ * Insert multiple tuples into the given catalog relation at once, with an
+ * amortized cost of CatalogOpenIndexes.
+ */
+void
+CatalogTuplesMultiInsertWithInfo(Relation heapRel, TupleTableSlot **slot,
+								 int ntuples, CatalogIndexState indstate)
+{
+	/* Nothing to do */
+	if (ntuples <= 0)
+		return;
+
+	heap_multi_insert(heapRel, slot, ntuples,
+					  GetCurrentCommandId(true), 0, NULL);
+
+	/*
+	 * There is no equivalent to heap_multi_insert for the catalog indexes, so
+	 * we must loop over and insert individually.
+	 */
+	for (int i = 0; i < ntuples; i++)
+	{
+		bool		should_free;
+		HeapTuple	tuple;
+
+		tuple = ExecFetchSlotHeapTuple(slot[i], true, &should_free);
+		tuple->t_tableOid = slot[i]->tts_tableOid;
+		CatalogIndexInsert(indstate, tuple);
+
+		if (should_free)
+			heap_freetuple(tuple);
+	}
+}
+
 /*
  * CatalogTupleUpdate - do heap and indexing work for updating a catalog tuple
  *
diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c
index 082b935a6984f..ef2b87927ceb5 100644
--- a/src/backend/catalog/pg_shdepend.c
+++ b/src/backend/catalog/pg_shdepend.c
@@ -785,6 +785,13 @@ checkSharedDependencies(Oid classId, Oid objectId,
 	return true;
 }
 
+
+/*
+ * Cap the maximum amount of bytes allocated for copyTemplateDependencies()
+ * slots.
+ */
+#define MAX_PGSHDEPEND_INSERT_BYTES 65535
+
 /*
  * copyTemplateDependencies
  *
@@ -799,14 +806,19 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	ScanKeyData key[1];
 	SysScanDesc scan;
 	HeapTuple	tup;
+	int			slotCount;
 	CatalogIndexState indstate;
-	Datum		values[Natts_pg_shdepend];
-	bool		nulls[Natts_pg_shdepend];
-	bool		replace[Natts_pg_shdepend];
+	TupleTableSlot **slot;
+	int			nslots;
 
 	sdepRel = table_open(SharedDependRelationId, RowExclusiveLock);
 	sdepDesc = RelationGetDescr(sdepRel);
 
+	nslots = MAX_PGSHDEPEND_INSERT_BYTES / sizeof(FormData_pg_shdepend);
+	slot = palloc(sizeof(TupleTableSlot *) * nslots);
+	for (int i = 0; i < nslots; i++)
+		slot[i] = MakeSingleTupleTableSlot(sdepDesc, &TTSOpsHeapTuple);
+
 	indstate = CatalogOpenIndexes(sdepRel);
 
 	/* Scan all entries with dbid = templateDbId */
@@ -818,14 +830,6 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	scan = systable_beginscan(sdepRel, SharedDependDependerIndexId, true,
 							  NULL, 1, key);
 
-	/* Set up to copy the tuples except for inserting newDbId */
-	memset(values, 0, sizeof(values));
-	memset(nulls, false, sizeof(nulls));
-	memset(replace, false, sizeof(replace));
-
-	replace[Anum_pg_shdepend_dbid - 1] = true;
-	values[Anum_pg_shdepend_dbid - 1] = ObjectIdGetDatum(newDbId);
-
 	/*
 	 * Copy the entries of the original database, changing the database Id to
 	 * that of the new database.  Note that because we are not copying rows
@@ -833,20 +837,46 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	 * copy the ownership dependency of the template database itself; this is
 	 * what we want.
 	 */
+	slotCount = 0;
 	while (HeapTupleIsValid(tup = systable_getnext(scan)))
 	{
-		HeapTuple	newtup;
+		Form_pg_shdepend shdep;
+
+		ExecClearTuple(slot[slotCount]);
+
+		shdep = (Form_pg_shdepend) GETSTRUCT(tup);
+
+		slot[slotCount]->tts_values[Anum_pg_shdepend_dbid] = ObjectIdGetDatum(newDbId);
+		slot[slotCount]->tts_values[Anum_pg_shdepend_classid] = shdep->classid;
+		slot[slotCount]->tts_values[Anum_pg_shdepend_objid] = shdep->objid;
+		slot[slotCount]->tts_values[Anum_pg_shdepend_objsubid] = shdep->objsubid;
+		slot[slotCount]->tts_values[Anum_pg_shdepend_refclassid] = shdep->refclassid;
+		slot[slotCount]->tts_values[Anum_pg_shdepend_refobjid] = shdep->refobjid;
+		slot[slotCount]->tts_values[Anum_pg_shdepend_deptype] = shdep->deptype;
 
-		newtup = heap_modify_tuple(tup, sdepDesc, values, nulls, replace);
-		CatalogTupleInsertWithInfo(sdepRel, newtup, indstate);
+		ExecStoreVirtualTuple(slot[slotCount]);
+		slotCount++;
 
-		heap_freetuple(newtup);
+		/* If slots are full, insert a batch of tuples */
+		if (slotCount == nslots)
+		{
+			CatalogTuplesMultiInsertWithInfo(sdepRel, slot, slotCount, indstate);
+			slotCount = 0;
+		}
 	}
 
+	/* Insert any tuples left in the buffer */
+	if (slotCount > 0)
+		CatalogTuplesMultiInsertWithInfo(sdepRel, slot, slotCount, indstate);
+
 	systable_endscan(scan);
 
 	CatalogCloseIndexes(indstate);
 	table_close(sdepRel, RowExclusiveLock);
+
+	for (int i = 0; i < nslots; i++)
+		ExecDropSingleTupleTableSlot(slot[i]);
+	pfree(slot);
 }
 
 /*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 27b596cb5912c..ac53f79ada2a6 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -5975,6 +5975,8 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel,
 	AlterTableCmd *childcmd;
 	AclResult	aclresult;
 	ObjectAddress address;
+	TupleDesc	tupdesc;
+	FormData_pg_attribute *aattr[] = {&attribute};
 
 	/* At top level, permission check was done in ATPrepCmd, else do it */
 	if (recursing)
@@ -6128,11 +6130,13 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel,
 	attribute.attislocal = colDef->is_local;
 	attribute.attinhcount = colDef->inhcount;
 	attribute.attcollation = collOid;
-	/* attribute.attacl is handled by InsertPgAttributeTuple */
+	/* attribute.attacl is handled by InsertPgAttributeTuples() */
 
 	ReleaseSysCache(typeTuple);
 
-	InsertPgAttributeTuple(attrdesc, &attribute, (Datum) 0, NULL);
+	tupdesc = CreateTupleDesc(lengthof(aattr), (FormData_pg_attribute **) &aattr);
+
+	InsertPgAttributeTuples(attrdesc, tupdesc, myrelid, NULL, NULL);
 
 	table_close(attrdesc, RowExclusiveLock);
 
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index cbfdfe2abe5ef..d31141c1a218f 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -93,10 +93,11 @@ extern void heap_truncate_check_FKs(List *relations, bool tempTables);
 
 extern List *heap_truncate_find_FKs(List *relationIds);
 
-extern void InsertPgAttributeTuple(Relation pg_attribute_rel,
-								   Form_pg_attribute new_attribute,
-								   Datum attoptions,
-								   CatalogIndexState indstate);
+extern void InsertPgAttributeTuples(Relation pg_attribute_rel,
+									TupleDesc tupdesc,
+									Oid new_rel_oid,
+									Datum *attoptions,
+									CatalogIndexState indstate);
 
 extern void InsertPgClassTuple(Relation pg_class_desc,
 							   Relation new_rel_desc,
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index 8be303870f886..a7e2a9b26b465 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -19,6 +19,7 @@
 #define INDEXING_H
 
 #include "access/htup.h"
+#include "nodes/execnodes.h"
 #include "utils/relcache.h"
 
 /*
@@ -36,6 +37,10 @@ extern void CatalogCloseIndexes(CatalogIndexState indstate);
 extern void CatalogTupleInsert(Relation heapRel, HeapTuple tup);
 extern void CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup,
 									   CatalogIndexState indstate);
+extern void CatalogTuplesMultiInsertWithInfo(Relation heapRel,
+											 TupleTableSlot **slot,
+											 int ntuples,
+											 CatalogIndexState indstate);
 extern void CatalogTupleUpdate(Relation heapRel, ItemPointer otid,
 							   HeapTuple tup);
 extern void CatalogTupleUpdateWithInfo(Relation heapRel,

From c5315f4f44843c20ada876fdb0d0828795dfbdf5 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Fri, 31 Jul 2020 14:15:18 +1200
Subject: [PATCH 42/54] Cache smgrnblocks() results in recovery.

Avoid repeatedly calling lseek(SEEK_END) during recovery by caching
the size of each fork.  For now, we can't use the same technique in
other processes, because we lack a shared invalidation mechanism.

Do this by generalizing the pre-existing caching used by FSM and VM
to support all forks.

Discussion: https://postgr.es/m/CAEepm%3D3SSw-Ty1DFcK%3D1rU-K6GSzYzfdD4d%2BZwapdN7dTa6%3DnQ%40mail.gmail.com
---
 contrib/pg_visibility/pg_visibility.c     |  2 +-
 src/backend/access/heap/visibilitymap.c   | 18 ++++-----
 src/backend/catalog/storage.c             |  4 +-
 src/backend/storage/freespace/freespace.c | 27 +++++++------
 src/backend/storage/smgr/smgr.c           | 49 +++++++++++++++++------
 src/include/storage/smgr.h                | 12 +++---
 6 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 68d580ed1e02c..e731161734ae2 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -392,7 +392,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
 	check_relation_relkind(rel);
 
 	RelationOpenSmgr(rel);
-	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
+	rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
 
 	block = visibilitymap_prepare_truncate(rel, 0);
 	if (BlockNumberIsValid(block))
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 0a51678c40df1..b1072183bcd6d 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -561,17 +561,16 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
 	 * If we haven't cached the size of the visibility map fork yet, check it
 	 * first.
 	 */
-	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
+	if (rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber)
 	{
 		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
-			rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
-														VISIBILITYMAP_FORKNUM);
+			smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
 		else
-			rel->rd_smgr->smgr_vm_nblocks = 0;
+			rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
 	}
 
 	/* Handle requests beyond EOF */
-	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
+	if (blkno >= rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM])
 	{
 		if (extend)
 			vm_extend(rel, blkno + 1);
@@ -641,11 +640,13 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 * Create the file first if it doesn't exist.  If smgr_vm_nblocks is
 	 * positive then it must exist, no need for an smgrexists call.
 	 */
-	if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
-		 rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
+	if ((rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == 0 ||
+		 rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) &&
 		!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
 		smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
 
+	/* Invalidate cache so that smgrnblocks() asks the kernel. */
+	rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
 	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
 
 	/* Now extend the file */
@@ -667,8 +668,5 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 */
 	CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);
 
-	/* Update local cache with the up-to-date size */
-	rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;
-
 	UnlockRelationForExtension(rel, ExclusiveLock);
 }
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index ec143b640aabf..9e6e6c42d3c83 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -290,8 +290,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
 	 */
 	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
-	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
-	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
+	for (int i = 0; i <= MAX_FORKNUM; ++i)
+		rel->rd_smgr->smgr_cached_nblocks[i] = InvalidBlockNumber;
 
 	/* Prepare for truncation of MAIN fork of the relation */
 	forks[nforks] = MAIN_FORKNUM;
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 95a21f6cc383e..6a96126b0c2ff 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -541,18 +541,19 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
 	 * value might be stale.  (We send smgr inval messages on truncation, but
 	 * not on extension.)
 	 */
-	if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
-		blkno >= rel->rd_smgr->smgr_fsm_nblocks)
+	if (rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
+		blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
 	{
+		/* Invalidate the cache so smgrnblocks asks the kernel. */
+		rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
 		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
-			rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
-														 FSM_FORKNUM);
+			smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
 		else
-			rel->rd_smgr->smgr_fsm_nblocks = 0;
+			rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0;
 	}
 
 	/* Handle requests beyond EOF */
-	if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
+	if (blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
 	{
 		if (extend)
 			fsm_extend(rel, blkno + 1);
@@ -621,14 +622,17 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	RelationOpenSmgr(rel);
 
 	/*
-	 * Create the FSM file first if it doesn't exist.  If smgr_fsm_nblocks is
-	 * positive then it must exist, no need for an smgrexists call.
+	 * Create the FSM file first if it doesn't exist.  If
+	 * smgr_cached_nblocks[FSM_FORKNUM] is positive then it must exist, no
+	 * need for an smgrexists call.
 	 */
-	if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
-		 rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
+	if ((rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == 0 ||
+		 rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber) &&
 		!smgrexists(rel->rd_smgr, FSM_FORKNUM))
 		smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
 
+	/* Invalidate cache so that smgrnblocks() asks the kernel. */
+	rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
 	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
 
 	while (fsm_nblocks_now < fsm_nblocks)
@@ -640,9 +644,6 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 		fsm_nblocks_now++;
 	}
 
-	/* Update local cache with the up-to-date size */
-	rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;
-
 	UnlockRelationForExtension(rel, ExclusiveLock);
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 7d667c6586f1f..dcc09df0c7723 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -17,6 +17,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
@@ -174,8 +175,8 @@ smgropen(RelFileNode rnode, BackendId backend)
 		/* hash_search already filled in the lookup key */
 		reln->smgr_owner = NULL;
 		reln->smgr_targblock = InvalidBlockNumber;
-		reln->smgr_fsm_nblocks = InvalidBlockNumber;
-		reln->smgr_vm_nblocks = InvalidBlockNumber;
+		for (int i = 0; i <= MAX_FORKNUM; ++i)
+			reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
 		reln->smgr_which = 0;	/* we only have md.c at present */
 
 		/* implementation-specific initialization */
@@ -464,6 +465,16 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 {
 	smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
 										 buffer, skipFsync);
+
+	/*
+	 * Normally we expect this to increase nblocks by one, but if the cached
+	 * value isn't as expected, just invalidate it so the next call asks the
+	 * kernel.
+	 */
+	if (reln->smgr_cached_nblocks[forknum] == blocknum)
+		reln->smgr_cached_nblocks[forknum] = blocknum + 1;
+	else
+		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 }
 
 /*
@@ -537,7 +548,20 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 BlockNumber
 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
 {
-	return smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+	BlockNumber result;
+
+	/*
+	 * For now, we only use cached values in recovery due to lack of a shared
+	 * invalidation mechanism for changes in file size.
+	 */
+	if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+		return reln->smgr_cached_nblocks[forknum];
+
+	result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+
+	reln->smgr_cached_nblocks[forknum] = result;
+
+	return result;
 }
 
 /*
@@ -576,20 +600,19 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 	/* Do the truncation */
 	for (i = 0; i < nforks; i++)
 	{
+		/* Make the cached size is invalid if we encounter an error. */
+		reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
+
 		smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
 
 		/*
-		 * We might as well update the local smgr_fsm_nblocks and
-		 * smgr_vm_nblocks settings. The smgr cache inval message that this
-		 * function sent will cause other backends to invalidate their copies
-		 * of smgr_fsm_nblocks and smgr_vm_nblocks, and these ones too at the
-		 * next command boundary. But these ensure they aren't outright wrong
-		 * until then.
+		 * We might as well update the local smgr_cached_nblocks values. The
+		 * smgr cache inval message that this function sent will cause other
+		 * backends to invalidate their copies of smgr_fsm_nblocks and
+		 * smgr_vm_nblocks, and these ones too at the next command boundary.
+		 * But these ensure they aren't outright wrong until then.
 		 */
-		if (forknum[i] == FSM_FORKNUM)
-			reln->smgr_fsm_nblocks = nblocks[i];
-		if (forknum[i] == VISIBILITYMAP_FORKNUM)
-			reln->smgr_vm_nblocks = nblocks[i];
+		reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
 	}
 }
 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 656665959319e..f28a842401326 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -45,15 +45,13 @@ typedef struct SMgrRelationData
 	struct SMgrRelationData **smgr_owner;
 
 	/*
-	 * These next three fields are not actually used or manipulated by smgr,
-	 * except that they are reset to InvalidBlockNumber upon a cache flush
-	 * event (in particular, upon truncation of the relation).  Higher levels
-	 * store cached state here so that it will be reset when truncation
-	 * happens.  In all three cases, InvalidBlockNumber means "unknown".
+	 * The following fields are reset to InvalidBlockNumber upon a cache flush
+	 * event, and hold the last known size for each fork.  This information is
+	 * currently only reliable during recovery, since there is no cache
+	 * invalidation for fork extension.
 	 */
 	BlockNumber smgr_targblock; /* current insertion target block */
-	BlockNumber smgr_fsm_nblocks;	/* last known size of fsm fork */
-	BlockNumber smgr_vm_nblocks;	/* last known size of vm fork */
+	BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1];	/* last known size */
 
 	/* additional public fields may someday exist here */
 

From 7b1110d2fd3da3d7536530d14952d4f4d9c25438 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 31 Jul 2020 14:17:29 +0900
Subject: [PATCH 43/54] Fix comment in instrument.h

local_blks_dirtied tracks the number of local blocks dirtied, not shared
ones.

Author: Kirk Jamison
Discussion: https://postgr.es/m/OSBPR01MB2341760686DC056DE89D2AB9EF710@OSBPR01MB2341.jpnprd01.prod.outlook.com
---
 src/include/executor/instrument.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index a97562e7a4fea..9dc3ecb07d79b 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -24,7 +24,7 @@ typedef struct BufferUsage
 	long		shared_blks_written;	/* # of shared disk blocks written */
 	long		local_blks_hit; /* # of local buffer hits */
 	long		local_blks_read;	/* # of local disk blocks read */
-	long		local_blks_dirtied; /* # of shared blocks dirtied */
+	long		local_blks_dirtied; /* # of local blocks dirtied */
 	long		local_blks_written; /* # of local disk blocks written */
 	long		temp_blks_read; /* # of temp blocks read */
 	long		temp_blks_written;	/* # of temp blocks written */

From 84b1c63ad41872792d47e523363fce1f0e230022 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Fri, 31 Jul 2020 17:27:09 +1200
Subject: [PATCH 44/54] Preallocate some DSM space at startup.

Create an optional region in the main shared memory segment that can be
used to acquire and release "fast" DSM segments, and can benefit from
huge pages allocated at cluster startup time, if configured.  Fall back
to the existing mechanisms when that space is full.  The size is
controlled by a new GUC min_dynamic_shared_memory, defaulting to 0.

Main region DSM segments initially contain whatever garbage the memory
held last time they were used, rather than zeroes.  That change revealed
that DSA areas failed to initialize themselves correctly in memory that
wasn't zeroed first, so fix that problem.

Discussion: https://postgr.es/m/CA%2BhUKGLAE2QBv-WgGp%2BD9P_J-%3Dyne3zof9nfMaqq1h3EGHFXYQ%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      |  24 +++
 src/backend/storage/ipc/dsm.c                 | 191 ++++++++++++++++--
 src/backend/storage/ipc/dsm_impl.c            |   3 +
 src/backend/storage/ipc/ipci.c                |   3 +
 src/backend/utils/misc/guc.c                  |  11 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/backend/utils/mmgr/dsa.c                  |   5 +-
 src/include/storage/dsm.h                     |   3 +
 src/include/storage/dsm_impl.h                |   1 +
 9 files changed, 216 insertions(+), 26 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 427947cf4962e..994155ca00e22 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1906,6 +1906,30 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-min-dynamic-shared-memory" xreflabel="min_dynamic_shared_memory">
+      <term><varname>min_dynamic_shared_memory</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>min_dynamic_shared_memory</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of memory that should be allocated at server
+        startup time for use by parallel queries.  When this memory region is
+        insufficient or exhausted by concurrent queries, new parallel queries
+        try to allocate extra shared memory temporarily from the operating
+        system using the method configured with
+        <varname>dynamic_shared_memory_type</varname>, which may be slower due
+        to memory management overheads.  Memory that is allocated at startup
+        time with <varname>min_dynamic_shared_memory</varname> is affected by
+        the <varname>huge_pages</varname> setting on operating systems where
+        that is supported, and may be more likely to benefit from larger pages
+        on operating systems where that is managed automatically.
+        The default value is <literal>0</literal> (none).
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
      </sect2>
 
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
index ef64d083570a4..dffbd8e82a2a2 100644
--- a/src/backend/storage/ipc/dsm.c
+++ b/src/backend/storage/ipc/dsm.c
@@ -35,10 +35,12 @@
 
 #include "lib/ilist.h"
 #include "miscadmin.h"
+#include "port/pg_bitutils.h"
 #include "storage/dsm.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
+#include "utils/freepage.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/resowner_private.h"
@@ -76,6 +78,8 @@ typedef struct dsm_control_item
 {
 	dsm_handle	handle;
 	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
+	size_t		first_page;
+	size_t		npages;
 	void	   *impl_private_pm_handle; /* only needed on Windows */
 	bool		pinned;
 } dsm_control_item;
@@ -95,10 +99,15 @@ static dsm_segment *dsm_create_descriptor(void);
 static bool dsm_control_segment_sane(dsm_control_header *control,
 									 Size mapped_size);
 static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
 
 /* Has this backend initialized the dynamic shared memory system yet? */
 static bool dsm_init_done = false;
 
+/* Preallocated DSM space in the main shared memory region. */
+static void *dsm_main_space_begin = NULL;
+
 /*
  * List of dynamic shared memory segments used by this backend.
  *
@@ -171,7 +180,7 @@ dsm_postmaster_startup(PGShmemHeader *shim)
 	{
 		Assert(dsm_control_address == NULL);
 		Assert(dsm_control_mapped_size == 0);
-		dsm_control_handle = random();
+		dsm_control_handle = random() << 1; /* Even numbers only */
 		if (dsm_control_handle == DSM_HANDLE_INVALID)
 			continue;
 		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
@@ -247,8 +256,12 @@ dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
 		if (refcnt == 0)
 			continue;
 
-		/* Log debugging information. */
+		/* If it was using the main shmem area, there is nothing to do. */
 		handle = old_control->item[i].handle;
+		if (is_main_region_dsm_handle(handle))
+			continue;
+
+		/* Log debugging information. */
 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
 			 handle, refcnt);
 
@@ -348,8 +361,11 @@ dsm_postmaster_shutdown(int code, Datum arg)
 		if (dsm_control->item[i].refcnt == 0)
 			continue;
 
-		/* Log debugging information. */
 		handle = dsm_control->item[i].handle;
+		if (is_main_region_dsm_handle(handle))
+			continue;
+
+		/* Log debugging information. */
 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
 			 handle);
 
@@ -418,6 +434,45 @@ dsm_set_control_handle(dsm_handle h)
 }
 #endif
 
+/*
+ * Reserve some space in the main shared memory segment for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+	return 1024 * 1024 * (size_t) min_dynamic_shared_memory;
+}
+
+/*
+ * Initialize space in the main shared memory segment for DSM segments.
+ */
+void
+dsm_shmem_init(void)
+{
+	size_t		size = dsm_estimate_size();
+	bool		found;
+
+	if (size == 0)
+		return;
+
+	dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found);
+	if (!found)
+	{
+		FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin;
+		size_t		first_page = 0;
+		size_t		pages;
+
+		/* Reserve space for the FreePageManager. */
+		while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager))
+			++first_page;
+
+		/* Initialize it and give it all the rest of the space. */
+		FreePageManagerInitialize(fpm, dsm_main_space_begin);
+		pages = (size / FPM_PAGE_SIZE) - first_page;
+		FreePageManagerPut(fpm, first_page, pages);
+	}
+}
+
 /*
  * Create a new dynamic shared memory segment.
  *
@@ -434,6 +489,10 @@ dsm_create(Size size, int flags)
 	dsm_segment *seg;
 	uint32		i;
 	uint32		nitems;
+	size_t		npages = 0;
+	size_t		first_page = 0;
+	FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+	bool		using_main_dsm_region = false;
 
 	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
 	Assert(IsUnderPostmaster);
@@ -444,20 +503,48 @@ dsm_create(Size size, int flags)
 	/* Create a new segment descriptor. */
 	seg = dsm_create_descriptor();
 
-	/* Loop until we find an unused segment identifier. */
-	for (;;)
+	/*
+	 * Lock the control segment while we try to allocate from the main shared
+	 * memory area, if configured.
+	 */
+	if (dsm_main_space_fpm)
 	{
-		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
-		seg->handle = random();
-		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
-			continue;
-		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
-						&seg->mapped_address, &seg->mapped_size, ERROR))
-			break;
+		npages = size / FPM_PAGE_SIZE;
+		if (size % FPM_PAGE_SIZE > 0)
+			++npages;
+
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+		if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+		{
+			/* We can carve out a piece of the main shared memory segment. */
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = npages * FPM_PAGE_SIZE;
+			using_main_dsm_region = true;
+			/* We'll choose a handle below. */
+		}
 	}
 
-	/* Lock the control segment so we can register the new segment. */
-	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	if (!using_main_dsm_region)
+	{
+		/*
+		 * We need to create a new memory segment.  Loop until we find an
+		 * unused segment identifier.
+		 */
+		if (dsm_main_space_fpm)
+			LWLockRelease(DynamicSharedMemoryControlLock);
+		for (;;)
+		{
+			Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+			seg->handle = random() << 1;	/* Even numbers only */
+			if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
+				continue;
+			if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+							&seg->mapped_address, &seg->mapped_size, ERROR))
+				break;
+		}
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	}
 
 	/* Search the control segment for an unused slot. */
 	nitems = dsm_control->nitems;
@@ -465,6 +552,14 @@ dsm_create(Size size, int flags)
 	{
 		if (dsm_control->item[i].refcnt == 0)
 		{
+			if (using_main_dsm_region)
+			{
+				seg->handle = make_main_region_dsm_handle(i);
+				dsm_control->item[i].first_page = first_page;
+				dsm_control->item[i].npages = npages;
+			}
+			else
+				Assert(!is_main_region_dsm_handle(seg->handle));
 			dsm_control->item[i].handle = seg->handle;
 			/* refcnt of 1 triggers destruction, so start at 2 */
 			dsm_control->item[i].refcnt = 2;
@@ -479,9 +574,12 @@ dsm_create(Size size, int flags)
 	/* Verify that we can support an additional mapping. */
 	if (nitems >= dsm_control->maxitems)
 	{
+		if (using_main_dsm_region)
+			FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
 		LWLockRelease(DynamicSharedMemoryControlLock);
-		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
-					&seg->mapped_address, &seg->mapped_size, WARNING);
+		if (!using_main_dsm_region)
+			dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
 		if (seg->resowner != NULL)
 			ResourceOwnerForgetDSM(seg->resowner, seg);
 		dlist_delete(&seg->node);
@@ -495,6 +593,12 @@ dsm_create(Size size, int flags)
 	}
 
 	/* Enter the handle into a new array slot. */
+	if (using_main_dsm_region)
+	{
+		seg->handle = make_main_region_dsm_handle(nitems);
+		dsm_control->item[i].first_page = first_page;
+		dsm_control->item[i].npages = npages;
+	}
 	dsm_control->item[nitems].handle = seg->handle;
 	/* refcnt of 1 triggers destruction, so start at 2 */
 	dsm_control->item[nitems].refcnt = 2;
@@ -580,6 +684,12 @@ dsm_attach(dsm_handle h)
 		/* Otherwise we've found a match. */
 		dsm_control->item[i].refcnt++;
 		seg->control_slot = i;
+		if (is_main_region_dsm_handle(seg->handle))
+		{
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				dsm_control->item[i].first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
+		}
 		break;
 	}
 	LWLockRelease(DynamicSharedMemoryControlLock);
@@ -597,8 +707,9 @@ dsm_attach(dsm_handle h)
 	}
 
 	/* Here's where we actually try to map the segment. */
-	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
-				&seg->mapped_address, &seg->mapped_size, ERROR);
+	if (!is_main_region_dsm_handle(seg->handle))
+		dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+					&seg->mapped_address, &seg->mapped_size, ERROR);
 
 	return seg;
 }
@@ -688,8 +799,9 @@ dsm_detach(dsm_segment *seg)
 	 */
 	if (seg->mapped_address != NULL)
 	{
-		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
-					&seg->mapped_address, &seg->mapped_size, WARNING);
+		if (!is_main_region_dsm_handle(seg->handle))
+			dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
 		seg->impl_private = NULL;
 		seg->mapped_address = NULL;
 		seg->mapped_size = 0;
@@ -729,10 +841,15 @@ dsm_detach(dsm_segment *seg)
 			 * other reason, the postmaster may not have any better luck than
 			 * we did.  There's not much we can do about that, though.
 			 */
-			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+			if (is_main_region_dsm_handle(seg->handle) ||
+				dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
 							&seg->mapped_address, &seg->mapped_size, WARNING))
 			{
 				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+				if (is_main_region_dsm_handle(seg->handle))
+					FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+									   dsm_control->item[control_slot].first_page,
+									   dsm_control->item[control_slot].npages);
 				Assert(dsm_control->item[control_slot].handle == seg->handle);
 				Assert(dsm_control->item[control_slot].refcnt == 1);
 				dsm_control->item[control_slot].refcnt = 0;
@@ -894,10 +1011,15 @@ dsm_unpin_segment(dsm_handle handle)
 		 * pass the mapped size, mapped address, and private data as NULL
 		 * here.
 		 */
-		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+		if (is_main_region_dsm_handle(handle) ||
+			dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
 						&junk_mapped_address, &junk_mapped_size, WARNING))
 		{
 			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+			if (is_main_region_dsm_handle(handle))
+				FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+								   dsm_control->item[control_slot].first_page,
+								   dsm_control->item[control_slot].npages);
 			Assert(dsm_control->item[control_slot].handle == handle);
 			Assert(dsm_control->item[control_slot].refcnt == 1);
 			dsm_control->item[control_slot].refcnt = 0;
@@ -1094,3 +1216,28 @@ dsm_control_bytes_needed(uint32 nitems)
 	return offsetof(dsm_control_header, item)
 		+ sizeof(dsm_control_item) * (uint64) nitems;
 }
+
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+	dsm_handle	handle;
+
+	/*
+	 * We need to create a handle that doesn't collide with any existing extra
+	 * segment created by dsm_impl_op(), so we'll make it odd.  It also
+	 * mustn't collide with any other main area pseudo-segment, so we'll
+	 * include the slot number in some of the bits.  We also want to make an
+	 * effort to avoid newly created and recently destroyed handles from being
+	 * confused, so we'll make the rest of the bits random.
+	 */
+	handle = 1;
+	handle |= slot << 1;
+	handle |= random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
+	return handle;
+}
+
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+	return handle & 1;
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 1972aecbedc1a..d4306418dcb24 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -113,6 +113,9 @@ const struct config_enum_entry dynamic_shared_memory_options[] = {
 /* Implementation selector. */
 int			dynamic_shared_memory_type;
 
+/* Amount of space reserved for DSM segments in the main area. */
+int			min_dynamic_shared_memory;
+
 /* Size of buffer to be used for zero-filling. */
 #define ZBUFFER_SIZE				8192
 
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index e850ebd131e3f..96c2aaabbd65c 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, SpinlockSemaSize());
 		size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
 												 sizeof(ShmemIndexEnt)));
+		size = add_size(size, dsm_estimate_size());
 		size = add_size(size, BufferShmemSize());
 		size = add_size(size, LockShmemSize());
 		size = add_size(size, PredicateLockShmemSize());
@@ -209,6 +210,8 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	InitShmemIndex();
 
+	dsm_shmem_init();
+
 	/*
 	 * Set up xlog, clog, and buffers
 	 */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c20885e97b203..6c6bb22014932 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2231,6 +2231,17 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"min_dynamic_shared_memory", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Amount of dynamic shared memory reserved at startup."),
+			NULL,
+			GUC_UNIT_MB
+		},
+		&min_dynamic_shared_memory,
+		0, 0, Min(INT_MAX, SIZE_MAX / 1024 / 1024),
+		NULL, NULL, NULL
+	},
+
 	/*
 	 * We sometimes multiply the number of shared buffers by two without
 	 * checking for overflow, so we mustn't allow more than INT_MAX / 2.
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index aa30291ea3964..b0715ae188180 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -148,6 +148,7 @@
 					#   windows
 					#   mmap
 					# (change requires restart)
+#min_dynamic_shared_memory = 0MB	# (change requires restart)
 
 # - Disk -
 
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
index b7ad8e62ef3f4..6e5e412429789 100644
--- a/src/backend/utils/mmgr/dsa.c
+++ b/src/backend/utils/mmgr/dsa.c
@@ -1223,6 +1223,7 @@ create_internal(void *place, size_t size,
 	 * space.
 	 */
 	control = (dsa_area_control *) place;
+	memset(place, 0, sizeof(*control));
 	control->segment_header.magic =
 		DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
 	control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
@@ -1233,14 +1234,10 @@ create_internal(void *place, size_t size,
 	control->handle = control_handle;
 	control->max_total_segment_size = (size_t) -1;
 	control->total_segment_size = size;
-	memset(&control->segment_handles[0], 0,
-		   sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
 	control->segment_handles[0] = control_handle;
 	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
 		control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
-	control->high_segment_index = 0;
 	control->refcnt = 1;
-	control->freed_segment_counter = 0;
 	control->lwlock_tranche_id = tranche_id;
 
 	/*
diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h
index 408c0543a6354..0455576f4af4c 100644
--- a/src/include/storage/dsm.h
+++ b/src/include/storage/dsm.h
@@ -29,6 +29,9 @@ extern void dsm_postmaster_startup(struct PGShmemHeader *);
 extern void dsm_backend_shutdown(void);
 extern void dsm_detach_all(void);
 
+extern size_t dsm_estimate_size(void);
+extern void dsm_shmem_init(void);
+
 #ifdef EXEC_BACKEND
 extern void dsm_set_control_handle(dsm_handle h);
 #endif
diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h
index 562cb781a812c..f6841e2534f96 100644
--- a/src/include/storage/dsm_impl.h
+++ b/src/include/storage/dsm_impl.h
@@ -40,6 +40,7 @@
 
 /* GUC. */
 extern int	dynamic_shared_memory_type;
+extern int	min_dynamic_shared_memory;
 
 /*
  * Directory for on-disk state.

From 7be04496a9f763fc4d4c1d06ce9ccc250e52df31 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Fri, 31 Jul 2020 19:08:09 +1200
Subject: [PATCH 45/54] Fix compiler warning from Clang.

Per build farm.

Discussion: https://postgr.es/m/20200731062626.GD3317%40paquier.xyz
---
 src/backend/utils/misc/guc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6c6bb22014932..de87ad6ef7028 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2238,7 +2238,7 @@ static struct config_int ConfigureNamesInt[] =
 			GUC_UNIT_MB
 		},
 		&min_dynamic_shared_memory,
-		0, 0, Min(INT_MAX, SIZE_MAX / 1024 / 1024),
+		0, 0, (int) Min((size_t) INT_MAX, SIZE_MAX / (1024 * 1024)),
 		NULL, NULL, NULL
 	},
 

From 78e73e87548a1e0b71b6f2425f76ea6e9c85b2eb Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 31 Jul 2020 11:43:12 -0400
Subject: [PATCH 46/54] Fix recently-introduced performance problem in
 ts_headline().

The new hlCover() algorithm that I introduced in commit c9b0c678d
turns out to potentially take O(N^2) or worse time on long documents,
if there are many occurrences of individual query words but few or no
substrings that actually satisfy the query.  (One way to hit this
behavior is with a "common_word & rare_word" type of query.)  This
seems unavoidable given the original goal of checking every substring
of the document, so we have to back off that idea.  Fortunately, it
seems unlikely that anyone would really want headlines spanning all of
a long document, so we can avoid the worse-than-linear behavior by
imposing a maximum length of substring that we'll consider.

For now, just hard-wire that maximum length as a multiple of max_words
times max_fragments.  Perhaps at some point somebody will argue for
exposing it as a ts_headline parameter, but I'm hesitant to make such
a feature addition in a back-patched bug fix.

I also noted that the hlFirstIndex() function I'd added in that
commit was unnecessarily stupid: it really only needs to check whether
a HeadlineWordEntry's item pointer is null or not.  This wouldn't make
all that much difference in typical cases with queries having just
a few terms, but a cycle shaved is a cycle earned.

In addition, add a CHECK_FOR_INTERRUPTS call in TS_execute_recurse.
This ensures that hlCover's loop is cancellable if it manages to take
a long time, and it may protect some other TS_execute callers as well.

Back-patch to 9.6 as the previous commit was.  I also chose to add the
CHECK_FOR_INTERRUPTS call to 9.5.  The old hlCover() algorithm seems
to avoid the O(N^2) behavior, at least on the test case I tried, but
nonetheless it's not very quick on a long document.

Per report from Stephen Frost.

Discussion: https://postgr.es/m/20200724160535.GW12375@tamriel.snowman.net
---
 src/backend/tsearch/wparser_def.c   | 57 ++++++++++++++++-------------
 src/backend/utils/adt/tsvector_op.c |  3 ++
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 76b6f9aef03ca..7b29062a97eaa 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -2003,24 +2003,14 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
  * Returns -1 if no such index
  */
 static int
-hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
+hlFirstIndex(HeadlineParsedText *prs, int pos)
 {
 	int			i;
 
-	/* For each word ... */
 	for (i = pos; i < prs->curwords; i++)
 	{
-		/* ... scan the query to see if this word matches any operand */
-		QueryItem  *item = GETQUERY(query);
-		int			j;
-
-		for (j = 0; j < query->size; j++)
-		{
-			if (item->type == QI_VAL &&
-				prs->words[i].item == &item->qoperand)
-				return i;
-			item++;
-		}
+		if (prs->words[i].item != NULL)
+			return i;
 	}
 	return -1;
 }
@@ -2028,8 +2018,14 @@ hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
 /*
  * hlCover: try to find a substring of prs' word list that satisfies query
  *
- * At entry, *p must be the first word index to consider (initialize this to
- * zero, or to the next index after a previous successful search).
+ * At entry, *p must be the first word index to consider (initialize this
+ * to zero, or to the next index after a previous successful search).
+ * We will consider all substrings starting at or after that word, and
+ * containing no more than max_cover words.  (We need a length limit to
+ * keep this from taking O(N^2) time for a long document with many query
+ * words but few complete matches.  Actually, since checkcondition_HL is
+ * roughly O(N) in the length of the substring being checked, it's even
+ * worse than that.)
  *
  * On success, sets *p to first word index and *q to last word index of the
  * cover substring, and returns true.
@@ -2038,7 +2034,8 @@ hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
  * words used in the query.
  */
 static bool
-hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
+hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
+		int *p, int *q)
 {
 	int			pmin,
 				pmax,
@@ -2052,7 +2049,7 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
 	 * appearing in the query; there's no point in trying endpoints in between
 	 * such points.
 	 */
-	pmin = hlFirstIndex(prs, query, *p);
+	pmin = hlFirstIndex(prs, *p);
 	while (pmin >= 0)
 	{
 		/* This useless assignment just keeps stupider compilers quiet */
@@ -2073,7 +2070,7 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
 				return true;
 			}
 			/* Nope, so advance pmax to next feasible endpoint */
-			nextpmax = hlFirstIndex(prs, query, pmax + 1);
+			nextpmax = hlFirstIndex(prs, pmax + 1);
 
 			/*
 			 * If this is our first advance past pmin, then the result is also
@@ -2084,7 +2081,7 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
 				nextpmin = nextpmax;
 			pmax = nextpmax;
 		}
-		while (pmax >= 0);
+		while (pmax >= 0 && pmax - pmin < max_cover);
 		/* No luck here, so try next feasible startpoint */
 		pmin = nextpmin;
 	}
@@ -2186,7 +2183,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 static void
 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 				  int shortword, int min_words,
-				  int max_words, int max_fragments)
+				  int max_words, int max_fragments, int max_cover)
 {
 	int32		poslen,
 				curlen,
@@ -2213,7 +2210,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 	covers = palloc(maxcovers * sizeof(CoverPos));
 
 	/* get all covers */
-	while (hlCover(prs, query, &p, &q))
+	while (hlCover(prs, query, max_cover, &p, &q))
 	{
 		startpos = p;
 		endpos = q;
@@ -2368,7 +2365,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
  */
 static void
 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
-			  int shortword, int min_words, int max_words)
+			  int shortword, int min_words, int max_words, int max_cover)
 {
 	int			p = 0,
 				q = 0;
@@ -2386,7 +2383,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 	if (!highlightall)
 	{
 		/* examine all covers, select a headline using the best one */
-		while (hlCover(prs, query, &p, &q))
+		while (hlCover(prs, query, max_cover, &p, &q))
 		{
 			/*
 			 * Count words (curlen) and interesting words (poslen) within
@@ -2542,6 +2539,7 @@ prsd_headline(PG_FUNCTION_ARGS)
 	int			shortword = 3;
 	int			max_fragments = 0;
 	bool		highlightall = false;
+	int			max_cover;
 	ListCell   *l;
 
 	/* Extract configuration option values */
@@ -2581,6 +2579,15 @@ prsd_headline(PG_FUNCTION_ARGS)
 							defel->defname)));
 	}
 
+	/*
+	 * We might eventually make max_cover a user-settable parameter, but for
+	 * now, just compute a reasonable value based on max_words and
+	 * max_fragments.
+	 */
+	max_cover = Max(max_words * 10, 100);
+	if (max_fragments > 0)
+		max_cover *= max_fragments;
+
 	/* in HighlightAll mode these parameters are ignored */
 	if (!highlightall)
 	{
@@ -2605,10 +2612,10 @@ prsd_headline(PG_FUNCTION_ARGS)
 	/* Apply appropriate headline selector */
 	if (max_fragments == 0)
 		mark_hl_words(prs, query, highlightall, shortword,
-					  min_words, max_words);
+					  min_words, max_words, max_cover);
 	else
 		mark_hl_fragments(prs, query, highlightall, shortword,
-						  min_words, max_words, max_fragments);
+						  min_words, max_words, max_fragments, max_cover);
 
 	/* Fill in default values for string options */
 	if (!prs->startsel)
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index f01b1ee25377f..756a48a167ad3 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1868,6 +1868,9 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
 	/* since this function recurses, it could be driven to stack overflow */
 	check_stack_depth();
 
+	/* ... and let's check for query cancel while we're at it */
+	CHECK_FOR_INTERRUPTS();
+
 	if (curitem->type == QI_VAL)
 		return chkcond(arg, (QueryOperand *) curitem,
 					   NULL /* don't need position info */ );

From 3d2376d55c6f2d364a6a1a95cc350c531f6d9423 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 31 Jul 2020 17:11:28 -0400
Subject: [PATCH 47/54] Fix oversight in ALTER TYPE: typmodin/typmodout must
 propagate to arrays.

If a base type supports typmods, its array type does too, with the
same interpretation.  Hence changes in pg_type.typmodin/typmodout
must be propagated to the array type.

While here, improve AlterTypeRecurse to not recurse to domains if
there is nothing we'd need to change.

Oversight in fe30e7ebf.  Back-patch to v13 where that came in.
---
 src/backend/commands/typecmds.c           | 63 +++++++++++++++++++----
 src/test/regress/expected/create_type.out | 16 ++++++
 src/test/regress/sql/create_type.sql      |  8 +++
 3 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index 9e5938b10eb1d..2e107ace39be9 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -127,7 +127,8 @@ static char *domainAddConstraint(Oid domainOid, Oid domainNamespace,
 								 const char *domainName, ObjectAddress *constrAddr);
 static Node *replace_domain_constraint_value(ParseState *pstate,
 											 ColumnRef *cref);
-static void AlterTypeRecurse(Oid typeOid, HeapTuple tup, Relation catalog,
+static void AlterTypeRecurse(Oid typeOid, bool isImplicitArray,
+							 HeapTuple tup, Relation catalog,
 							 AlterTypeRecurseParams *atparams);
 
 
@@ -3853,8 +3854,8 @@ AlterType(AlterTypeStmt *stmt)
 				 errmsg("%s is not a base type",
 						format_type_be(typeOid))));
 
-	/* OK, recursively update this type and any domains over it */
-	AlterTypeRecurse(typeOid, tup, catalog, &atparams);
+	/* OK, recursively update this type and any arrays/domains over it */
+	AlterTypeRecurse(typeOid, false, tup, catalog, &atparams);
 
 	/* Clean up */
 	ReleaseSysCache(tup);
@@ -3870,13 +3871,15 @@ AlterType(AlterTypeStmt *stmt)
  * AlterTypeRecurse: one recursion step for AlterType()
  *
  * Apply the changes specified by "atparams" to the type identified by
- * "typeOid", whose existing pg_type tuple is "tup".  Then search for any
- * domains over this type, and recursively apply (most of) the same changes
- * to those domains.
+ * "typeOid", whose existing pg_type tuple is "tup".  If necessary,
+ * recursively update its array type as well.  Then search for any domains
+ * over this type, and recursively apply (most of) the same changes to those
+ * domains.
  *
  * We need this because the system generally assumes that a domain inherits
  * many properties from its base type.  See DefineDomain() above for details
- * of what is inherited.
+ * of what is inherited.  Arrays inherit a smaller number of properties,
+ * but not none.
  *
  * There's a race condition here, in that some other transaction could
  * concurrently add another domain atop this base type; we'd miss updating
@@ -3888,7 +3891,8 @@ AlterType(AlterTypeStmt *stmt)
  * committed.
  */
 static void
-AlterTypeRecurse(Oid typeOid, HeapTuple tup, Relation catalog,
+AlterTypeRecurse(Oid typeOid, bool isImplicitArray,
+				 HeapTuple tup, Relation catalog,
 				 AlterTypeRecurseParams *atparams)
 {
 	Datum		values[Natts_pg_type];
@@ -3949,12 +3953,43 @@ AlterTypeRecurse(Oid typeOid, HeapTuple tup, Relation catalog,
 							 NULL,	/* don't have defaultExpr handy */
 							 NULL,	/* don't have typacl handy */
 							 0, /* we rejected composite types above */
-							 false, /* and we rejected implicit arrays above */
-							 false, /* so it can't be a dependent type */
+							 isImplicitArray,	/* it might be an array */
+							 isImplicitArray,	/* dependent iff it's array */
 							 true);
 
 	InvokeObjectPostAlterHook(TypeRelationId, typeOid, 0);
 
+	/*
+	 * Arrays inherit their base type's typmodin and typmodout, but none of
+	 * the other properties we're concerned with here.  Recurse to the array
+	 * type if needed.
+	 */
+	if (!isImplicitArray &&
+		(atparams->updateTypmodin || atparams->updateTypmodout))
+	{
+		Oid			arrtypoid = ((Form_pg_type) GETSTRUCT(newtup))->typarray;
+
+		if (OidIsValid(arrtypoid))
+		{
+			HeapTuple	arrtup;
+			AlterTypeRecurseParams arrparams;
+
+			arrtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(arrtypoid));
+			if (!HeapTupleIsValid(arrtup))
+				elog(ERROR, "cache lookup failed for type %u", arrtypoid);
+
+			memset(&arrparams, 0, sizeof(arrparams));
+			arrparams.updateTypmodin = atparams->updateTypmodin;
+			arrparams.updateTypmodout = atparams->updateTypmodout;
+			arrparams.typmodinOid = atparams->typmodinOid;
+			arrparams.typmodoutOid = atparams->typmodoutOid;
+
+			AlterTypeRecurse(arrtypoid, true, arrtup, catalog, &arrparams);
+
+			ReleaseSysCache(arrtup);
+		}
+	}
+
 	/*
 	 * Now we need to recurse to domains.  However, some properties are not
 	 * inherited by domains, so clear the update flags for those.
@@ -3963,6 +3998,12 @@ AlterTypeRecurse(Oid typeOid, HeapTuple tup, Relation catalog,
 	atparams->updateTypmodin = false;	/* domains don't have typmods */
 	atparams->updateTypmodout = false;
 
+	/* Skip the scan if nothing remains to be done */
+	if (!(atparams->updateStorage ||
+		  atparams->updateSend ||
+		  atparams->updateAnalyze))
+		return;
+
 	/* Search pg_type for possible domains over this type */
 	ScanKeyInit(&key[0],
 				Anum_pg_type_typbasetype,
@@ -3983,7 +4024,7 @@ AlterTypeRecurse(Oid typeOid, HeapTuple tup, Relation catalog,
 		if (domainForm->typtype != TYPTYPE_DOMAIN)
 			continue;
 
-		AlterTypeRecurse(domainForm->oid, domainTup, catalog, atparams);
+		AlterTypeRecurse(domainForm->oid, false, domainTup, catalog, atparams);
 	}
 
 	systable_endscan(scan);
diff --git a/src/test/regress/expected/create_type.out b/src/test/regress/expected/create_type.out
index 86a8b65450f65..f85afcb31edf1 100644
--- a/src/test/regress/expected/create_type.out
+++ b/src/test/regress/expected/create_type.out
@@ -270,6 +270,14 @@ FROM pg_type WHERE typname = 'myvarchar';
  myvarcharin | myvarcharout | myvarcharrecv | myvarcharsend | varchartypmodin | varchartypmodout | array_typanalyze | x
 (1 row)
 
+SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
+       typanalyze, typstorage
+FROM pg_type WHERE typname = '_myvarchar';
+ typinput | typoutput | typreceive |  typsend   |    typmodin     |    typmodout     |    typanalyze    | typstorage 
+----------+-----------+------------+------------+-----------------+------------------+------------------+------------
+ array_in | array_out | array_recv | array_send | varchartypmodin | varchartypmodout | array_typanalyze | x
+(1 row)
+
 SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
        typanalyze, typstorage
 FROM pg_type WHERE typname = 'myvarchardom';
@@ -278,6 +286,14 @@ FROM pg_type WHERE typname = 'myvarchardom';
  domain_in | myvarcharout | domain_recv | myvarcharsend | -        | -         | array_typanalyze | x
 (1 row)
 
+SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
+       typanalyze, typstorage
+FROM pg_type WHERE typname = '_myvarchardom';
+ typinput | typoutput | typreceive |  typsend   | typmodin | typmodout |    typanalyze    | typstorage 
+----------+-----------+------------+------------+----------+-----------+------------------+------------
+ array_in | array_out | array_recv | array_send | -        | -         | array_typanalyze | x
+(1 row)
+
 -- ensure dependencies are straight
 DROP FUNCTION myvarcharsend(myvarchar);  -- fail
 ERROR:  cannot drop function myvarcharsend(myvarchar) because other objects depend on it
diff --git a/src/test/regress/sql/create_type.sql b/src/test/regress/sql/create_type.sql
index 5b176bb2aed02..584ece06701d3 100644
--- a/src/test/regress/sql/create_type.sql
+++ b/src/test/regress/sql/create_type.sql
@@ -214,10 +214,18 @@ SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
        typanalyze, typstorage
 FROM pg_type WHERE typname = 'myvarchar';
 
+SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
+       typanalyze, typstorage
+FROM pg_type WHERE typname = '_myvarchar';
+
 SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
        typanalyze, typstorage
 FROM pg_type WHERE typname = 'myvarchardom';
 
+SELECT typinput, typoutput, typreceive, typsend, typmodin, typmodout,
+       typanalyze, typstorage
+FROM pg_type WHERE typname = '_myvarchardom';
+
 -- ensure dependencies are straight
 DROP FUNCTION myvarcharsend(myvarchar);  -- fail
 DROP TYPE myvarchar;  -- fail

From c79aed4f793086300abfc188def94b5c0bd0b45d Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 31 Jul 2020 15:34:28 -0700
Subject: [PATCH 48/54] Restore lost amcheck TOAST test coverage.

Commit eba77534 fixed an amcheck false positive bug involving
inconsistencies in TOAST input state between table and index.  A test
case was added that verified that such an inconsistency didn't result in
a spurious corruption related error.

Test coverage from the test was accidentally lost by commit 501e41dd,
which propagated ALTER TABLE ...  SET STORAGE attstorage state to
indexes.  This broke the test because the test specifically relied on
attstorage not being propagated.  This artificially forced there to be
index tuples whose datums were equivalent to the datums in the heap
without the datums actually being bitwise equal.

Fix this by updating pg_attribute directly instead.  Commit 501e41dd
made similar changes to a test_decoding TOAST-related test case which
made the same assumption, but overlooked the amcheck test case.

Backpatch: 11-, just like commit eba77534 (and commit 501e41dd).
---
 contrib/amcheck/expected/check_btree.out | 16 ++++++++++++----
 contrib/amcheck/sql/check_btree.sql      | 14 ++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out
index f82f48d23b78f..13848b7449b7f 100644
--- a/contrib/amcheck/expected/check_btree.out
+++ b/contrib/amcheck/expected/check_btree.out
@@ -155,11 +155,19 @@ SELECT bt_index_parent_check('delete_test_table_pkey', true);
 -- tuple.  Bloom filter must fingerprint normalized index tuple representation.
 --
 CREATE TABLE toast_bug(buggy text);
-ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE plain;
--- pg_attribute entry for toasty.buggy will have plain storage:
-CREATE INDEX toasty ON toast_bug(buggy);
--- Whereas pg_attribute entry for toast_bug.buggy now has extended storage:
 ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended;
+CREATE INDEX toasty ON toast_bug(buggy);
+-- pg_attribute entry for toasty.buggy (the index) will have plain storage:
+UPDATE pg_attribute SET attstorage = 'p'
+WHERE attrelid = 'toasty'::regclass AND attname = 'buggy';
+-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage:
+SELECT attstorage FROM pg_attribute
+WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy';
+ attstorage 
+------------
+ x
+(1 row)
+
 -- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD):
 INSERT INTO toast_bug SELECT repeat('a', 2200);
 -- Should not get false positive report of corruption:
diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql
index a1fef644cb08c..97a3e1a20d501 100644
--- a/contrib/amcheck/sql/check_btree.sql
+++ b/contrib/amcheck/sql/check_btree.sql
@@ -99,11 +99,17 @@ SELECT bt_index_parent_check('delete_test_table_pkey', true);
 -- tuple.  Bloom filter must fingerprint normalized index tuple representation.
 --
 CREATE TABLE toast_bug(buggy text);
-ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE plain;
--- pg_attribute entry for toasty.buggy will have plain storage:
-CREATE INDEX toasty ON toast_bug(buggy);
--- Whereas pg_attribute entry for toast_bug.buggy now has extended storage:
 ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended;
+CREATE INDEX toasty ON toast_bug(buggy);
+
+-- pg_attribute entry for toasty.buggy (the index) will have plain storage:
+UPDATE pg_attribute SET attstorage = 'p'
+WHERE attrelid = 'toasty'::regclass AND attname = 'buggy';
+
+-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage:
+SELECT attstorage FROM pg_attribute
+WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy';
+
 -- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD):
 INSERT INTO toast_bug SELECT repeat('a', 2200);
 -- Should not get false positive report of corruption:

From 84c0e4b9bce794da914fe9c062753bf21369745f Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Sat, 1 Aug 2020 12:16:15 +1200
Subject: [PATCH 49/54] Improve programmer docs for simplehash and dynahash.

When reading the code it's not obvious when one should prefer dynahash
over simplehash and vice-versa, so, for programmer-friendliness, add
comments to inform that decision.

Show sample simplehash method signatures.

Author: James Coleman <jtc331@gmail.com>
Discussion: https://postgr.es/m/CAAaqYe_dOF39gAJ8rL-a3YO3Qo96MHMRQ2whFjK5ZcU6YvMQSA%40mail.gmail.com
---
 src/backend/utils/hash/dynahash.c | 12 ++++-
 src/include/lib/simplehash.h      | 73 +++++++++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 5948b01abc34e..f4fbccdd7e444 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * dynahash.c
- *	  dynamic hash tables
+ *	  dynamic chained hash tables
  *
  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
  * shared memory.  For shared hash tables, it is the caller's responsibility
@@ -41,6 +41,16 @@
  * function must be supplied; comparison defaults to memcmp() and key copying
  * to memcpy() when a user-defined hashing function is selected.
  *
+ * Compared to simplehash, dynahash has the following benefits:
+ *
+ * - It supports partitioning, which is useful for shared memory access using
+ *   locks.
+ * - Shared memory hashes are allocated in a fixed size area at startup and
+ *   are discoverable by name from other processes.
+ * - Because entries don't need to be moved in the case of hash conflicts, has
+ *   better performance for large entries
+ * - Guarantees stable pointers to entries.
+ *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index 90dfa8a695dd2..96f0c21f60644 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -1,10 +1,27 @@
 /*
  * simplehash.h
  *
- *	  Hash table implementation which will be specialized to user-defined
- *	  types, by including this file to generate the required code.  It's
- *	  probably not worthwhile to do so for hash tables that aren't performance
- *	  or space sensitive.
+ *	  When included this file generates a "templated" (by way of macros)
+ *	  open-addressing hash table implementation specialized to user-defined
+ *	  types.
+ *
+ *	  It's probably not worthwhile to generate such a specialized implementation
+ *	  for hash tables that aren't performance or space sensitive.
+ *
+ *	  Compared to dynahash, simplehash has the following benefits:
+ *
+ *	  - Due to the "templated" code generation has known structure sizes and no
+ *	    indirect function calls (which show up substantially in dynahash
+ *	    profiles). These features considerably increase speed for small
+ *	    entries.
+ *	  - Open addressing has better CPU cache behavior than dynahash's chained
+ *	    hashtables.
+ *	  - The generated interface is type-safe and easier to use than dynahash,
+ *	    though at the cost of more complex setup.
+ *	  - Allocates memory in a MemoryContext or another allocator with a
+ *	    malloc/free style interface (which isn't easily usable in a shared
+ *	    memory context)
+ *	  - Does not require the overhead of a separate memory context.
  *
  * Usage notes:
  *
@@ -34,6 +51,19 @@
  *	  - SH_STORE_HASH - if defined the hash is stored in the elements
  *	  - SH_GET_HASH(tb, a) - return the field to store the hash in
  *
+ *	  The element type is required to contain a "uint32 status" member.
+ *
+ *	  While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because
+ *	  the hash table implementation needs to compare hashes to move elements
+ *	  (particularly when growing the hash), it's preferable, if possible, to
+ *	  store the element's hash in the element's data type. If the hash is so
+ *	  stored, the hash table will also compare hashes before calling SH_EQUAL
+ *	  when comparing two keys.
+ *
+ *	  For convenience the hash table create functions accept a void pointer
+ *	  that will be stored in the hash table type's member private_data. This
+ *	  allows callbacks to reference caller provided data.
+ *
  *	  For examples of usage look at tidbitmap.c (file local definition) and
  *	  execnodes.h/execGrouping.c (exposed declaration, file local
  *	  implementation).
@@ -149,24 +179,59 @@ typedef struct SH_ITERATOR
 
 /* externally visible function prototypes */
 #ifdef SH_RAW_ALLOCATOR
+/* <prefix>_hash <prefix>_create(uint32 nelements, void *private_data) */
 SH_SCOPE	SH_TYPE *SH_CREATE(uint32 nelements, void *private_data);
 #else
+/*
+ * <prefix>_hash <prefix>_create(MemoryContext ctx, uint32 nelements,
+ *								 void *private_data)
+ */
 SH_SCOPE	SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements,
 							   void *private_data);
 #endif
+
+/* void <prefix>_destroy(<prefix>_hash *tb) */
 SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
+
+/* void <prefix>_reset(<prefix>_hash *tb) */
 SH_SCOPE void SH_RESET(SH_TYPE * tb);
+
+/* void <prefix>_grow(<prefix>_hash *tb) */
 SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
+
+/* <element> *<prefix>_insert(<prefix>_hash *tb, <key> key, bool *found) */
 SH_SCOPE	SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found);
+
+/*
+ * <element> *<prefix>_insert_hash(<prefix>_hash *tb, <key> key, uint32 hash,
+ * 								  bool *found)
+ */
 SH_SCOPE	SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
 											uint32 hash, bool *found);
+
+/* <element> *<prefix>_lookup(<prefix>_hash *tb, <key> key) */
 SH_SCOPE	SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key);
+
+/* <element> *<prefix>_lookup_hash(<prefix>_hash *tb, <key> key, uint32 hash) */
 SH_SCOPE	SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
 											uint32 hash);
+
+/* bool <prefix>_delete(<prefix>_hash *tb, <key> key) */
 SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key);
+
+/* void <prefix>_start_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
 SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
+
+/*
+ * void <prefix>_start_iterate_at(<prefix>_hash *tb, <prefix>_iterator *iter,
+ *								  uint32 at)
+ */
 SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
+
+/* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
 SH_SCOPE	SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
+
+/* void <prefix>_stat(<prefix>_hash *tb */
 SH_SCOPE void SH_STAT(SH_TYPE * tb);
 
 #endif							/* SH_DECLARE */

From 022350b8495a8a7ff0ff8dd6791572e91e7cd6fe Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Sat, 1 Aug 2020 11:49:13 +0900
Subject: [PATCH 50/54] Minimize slot creation for multi-inserts of pg_shdepend

When doing multiple insertions in pg_shdepend for the copy of
dependencies from a template database in CREATE DATABASE, the same
number of slots would have been created and used all the time.  As the
number of items to insert is not known in advance, this makes most of
the slots created for nothing.  This improves the slot handling so as
slot creation only happens when needed, minimizing the overhead of the
operation.

Author: Michael Paquier
Reviewed-by: Daniel Gustafsson
Discussion: https://postgr.es/m/20200731024148.GB3317@paquier.xyz
---
 src/backend/catalog/pg_shdepend.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c
index ef2b87927ceb5..30b234e90e124 100644
--- a/src/backend/catalog/pg_shdepend.c
+++ b/src/backend/catalog/pg_shdepend.c
@@ -809,15 +809,19 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	int			slotCount;
 	CatalogIndexState indstate;
 	TupleTableSlot **slot;
-	int			nslots;
+	int			nslots,
+				max_slots;
+	bool		slot_init = true;
 
 	sdepRel = table_open(SharedDependRelationId, RowExclusiveLock);
 	sdepDesc = RelationGetDescr(sdepRel);
 
-	nslots = MAX_PGSHDEPEND_INSERT_BYTES / sizeof(FormData_pg_shdepend);
-	slot = palloc(sizeof(TupleTableSlot *) * nslots);
-	for (int i = 0; i < nslots; i++)
-		slot[i] = MakeSingleTupleTableSlot(sdepDesc, &TTSOpsHeapTuple);
+	/*
+	 * Allocate the slots to use, but delay initialization until we know that
+	 * they will be used.
+	 */
+	max_slots = MAX_PGSHDEPEND_INSERT_BYTES / sizeof(FormData_pg_shdepend);
+	slot = palloc(sizeof(TupleTableSlot *) * max_slots);
 
 	indstate = CatalogOpenIndexes(sdepRel);
 
@@ -842,6 +846,9 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	{
 		Form_pg_shdepend shdep;
 
+		if (slot_init)
+			slot[slotCount] = MakeSingleTupleTableSlot(sdepDesc, &TTSOpsHeapTuple);
+
 		ExecClearTuple(slot[slotCount]);
 
 		shdep = (Form_pg_shdepend) GETSTRUCT(tup);
@@ -858,10 +865,11 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 		slotCount++;
 
 		/* If slots are full, insert a batch of tuples */
-		if (slotCount == nslots)
+		if (slotCount == max_slots)
 		{
 			CatalogTuplesMultiInsertWithInfo(sdepRel, slot, slotCount, indstate);
 			slotCount = 0;
+			slot_init = false;
 		}
 	}
 
@@ -874,6 +882,8 @@ copyTemplateDependencies(Oid templateDbId, Oid newDbId)
 	CatalogCloseIndexes(indstate);
 	table_close(sdepRel, RowExclusiveLock);
 
+	/* Drop only the number of slots used */
+	nslots = slot_init ? slotCount : max_slots;
 	for (int i = 0; i < nslots; i++)
 		ExecDropSingleTupleTableSlot(slot[i]);
 	pfree(slot);

From e2b37d9e7cabc90633c4bd822e1bcfdd1bda44c4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Sat, 1 Aug 2020 23:39:36 +1200
Subject: [PATCH 51/54] Use pg_pread() and pg_pwrite() in slru.c.

This avoids lseek() system calls at every SLRU I/O, as was
done for relation files in commit c24dcd0c.

Reviewed-by: Ashwin Agrawal <aagrawal@pivotal.io>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKG%2Biqke4uTRFj8D8uEUUgj%2BRokPSp%2BCWM6YYzaaamG9Wvg%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGJ%2BoHhnvqjn3%3DHro7xu-YDR8FPr0FL6LF35kHRX%3D_bUzg%40mail.gmail.com
---
 src/backend/access/transam/slru.c | 25 ++++---------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 61249f4a12dff..9e145f1c36acb 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -669,7 +669,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 	SlruShared	shared = ctl->shared;
 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
 	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
-	int			offset = rpageno * BLCKSZ;
+	off_t		offset = rpageno * BLCKSZ;
 	char		path[MAXPGPATH];
 	int			fd;
 
@@ -699,17 +699,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 		return true;
 	}
 
-	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
-	{
-		slru_errcause = SLRU_SEEK_FAILED;
-		slru_errno = errno;
-		CloseTransientFile(fd);
-		return false;
-	}
-
 	errno = 0;
 	pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
-	if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
+	if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
 	{
 		pgstat_report_wait_end();
 		slru_errcause = SLRU_READ_FAILED;
@@ -749,7 +741,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	SlruShared	shared = ctl->shared;
 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
 	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
-	int			offset = rpageno * BLCKSZ;
+	off_t		offset = rpageno * BLCKSZ;
 	char		path[MAXPGPATH];
 	int			fd = -1;
 
@@ -862,18 +854,9 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 		}
 	}
 
-	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
-	{
-		slru_errcause = SLRU_SEEK_FAILED;
-		slru_errno = errno;
-		if (!fdata)
-			CloseTransientFile(fd);
-		return false;
-	}
-
 	errno = 0;
 	pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
-	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
+	if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
 	{
 		pgstat_report_wait_end();
 		/* if write didn't set errno, assume problem is no disk space */

From 9f9682783bea74bf8d93cac4f7dd65fa677f5dc7 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 1 Aug 2020 17:12:47 -0400
Subject: [PATCH 52/54] Invent "amadjustmembers" AM method for validating
 opclass members.

This allows AM-specific knowledge to be applied during creation of
pg_amop and pg_amproc entries.  Specifically, the AM knows better than
core code which entries to consider as required or optional.  Giving
the latter entries the appropriate sort of dependency allows them to
be dropped without taking out the whole opclass or opfamily; which
is something we'd like to have to correct obsolescent entries in
extensions.

This callback also opens the door to performing AM-specific validity
checks during opclass creation, rather than hoping than an opclass
developer will remember to test with "amvalidate".  For the most part
I've not actually added any such checks yet; that can happen in a
follow-on patch.  (Note that we shouldn't remove any tests from
"amvalidate", as those are still needed to cross-check manually
constructed entries in the initdb data.  So adding tests to
"amadjustmembers" will be somewhat duplicative, but it seems like
a good idea anyway.)

Patch by me, reviewed by Alexander Korotkov, Hamid Akhtar, and
Anastasia Lubennikova.

Discussion: https://postgr.es/m/4578.1565195302@sss.pgh.pa.us
---
 contrib/bloom/blutils.c                 |   1 +
 doc/src/sgml/indexam.sgml               |  44 +++++-
 src/backend/access/brin/brin.c          |   1 +
 src/backend/access/gin/ginutil.c        |   1 +
 src/backend/access/gin/ginvalidate.c    |  65 +++++++++
 src/backend/access/gist/gist.c          |   1 +
 src/backend/access/gist/gistvalidate.c  |  69 ++++++++++
 src/backend/access/hash/hash.c          |   1 +
 src/backend/access/hash/hashvalidate.c  |  96 +++++++++++++
 src/backend/access/index/amvalidate.c   |  39 ++++--
 src/backend/access/nbtree/nbtree.c      |   1 +
 src/backend/access/nbtree/nbtvalidate.c |  96 +++++++++++++
 src/backend/access/spgist/spgutils.c    |   1 +
 src/backend/access/spgist/spgvalidate.c |  66 +++++++++
 src/backend/commands/opclasscmds.c      | 170 ++++++++++++++----------
 src/bin/pg_dump/t/002_pg_dump.pl        |  10 +-
 src/include/access/amapi.h              |  43 ++++++
 src/include/access/amvalidate.h         |   5 +-
 src/include/access/gin_private.h        |   4 +
 src/include/access/gist_private.h       |   4 +
 src/include/access/hash.h               |   4 +
 src/include/access/nbtree.h             |   4 +
 src/include/access/spgist.h             |   4 +
 src/include/catalog/opfam_internal.h    |  28 ----
 24 files changed, 646 insertions(+), 112 deletions(-)
 delete mode 100644 src/include/catalog/opfam_internal.h

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index d3bf8665df1f9..26b9927c3aaf9 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -139,6 +139,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = NULL;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = blvalidate;
+	amroutine->amadjustmembers = NULL;
 	amroutine->ambeginscan = blbeginscan;
 	amroutine->amrescan = blrescan;
 	amroutine->amgettuple = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index af87f172a7cd0..1aea4db707d5d 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -143,6 +143,7 @@ typedef struct IndexAmRoutine
     amproperty_function amproperty;     /* can be NULL */
     ambuildphasename_function ambuildphasename;   /* can be NULL */
     amvalidate_function amvalidate;
+    amadjustmembers_function amadjustmembers; /* can be NULL */
     ambeginscan_function ambeginscan;
     amrescan_function amrescan;
     amgettuple_function amgettuple;     /* can be NULL */
@@ -502,7 +503,48 @@ amvalidate (Oid opclassoid);
    the access method can reasonably do that.  For example, this might include
    testing that all required support functions are provided.
    The <function>amvalidate</function> function must return false if the opclass is
-   invalid.  Problems should be reported with <function>ereport</function> messages.
+   invalid.  Problems should be reported with <function>ereport</function>
+   messages, typically at <literal>INFO</literal> level.
+  </para>
+
+  <para>
+<programlisting>
+void
+amadjustmembers (Oid opfamilyoid,
+                 Oid opclassoid,
+                 List *operators,
+                 List *functions);
+</programlisting>
+   Validate proposed new operator and function members of an operator family,
+   so far as the access method can reasonably do that, and set their
+   dependency types if the default is not satisfactory.  This is called
+   during <command>CREATE OPERATOR CLASS</command> and during
+   <command>ALTER OPERATOR FAMILY ADD</command>; in the latter
+   case <parameter>opclassoid</parameter> is <literal>InvalidOid</literal>.
+   The <type>List</type> arguments are lists
+   of <structname>OpFamilyMember</structname> structs, as defined
+   in <filename>amapi.h</filename>.
+
+   Tests done by this function will typically be a subset of those
+   performed by <function>amvalidate</function>,
+   since <function>amadjustmembers</function> cannot assume that it is
+   seeing a complete set of members.  For example, it would be reasonable
+   to check the signature of a support function, but not to check whether
+   all required support functions are provided.  Any problems can be
+   reported by throwing an error.
+
+   The dependency-related fields of
+   the <structname>OpFamilyMember</structname> structs are initialized by
+   the core code to create hard dependencies on the opclass if this
+   is <command>CREATE OPERATOR CLASS</command>, or soft dependencies on the
+   opfamily if this is <command>ALTER OPERATOR FAMILY ADD</command>.
+   <function>amadjustmembers</function> can adjust these fields if some other
+   behavior is more appropriate.  For example, GIN, GiST, and SP-GiST
+   always set operator members to have soft dependencies on the opfamily,
+   since the connection between an operator and an opclass is relatively
+   weak in these index types; so it is reasonable to allow operator members
+   to be added and removed freely.  Optional support functions are typically
+   also given soft dependencies, so that they can be removed if necessary.
   </para>
 
 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 7db3ae5ee0cf5..1f72562c60307 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -120,6 +120,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = NULL;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = brinvalidate;
+	amroutine->amadjustmembers = NULL;
 	amroutine->ambeginscan = brinbeginscan;
 	amroutine->amrescan = brinrescan;
 	amroutine->amgettuple = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index a400f1fedbc55..ef9b56fd363af 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -71,6 +71,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = NULL;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = ginvalidate;
+	amroutine->amadjustmembers = ginadjustmembers;
 	amroutine->ambeginscan = ginbeginscan;
 	amroutine->amrescan = ginrescan;
 	amroutine->amgettuple = NULL;
diff --git a/src/backend/access/gin/ginvalidate.c b/src/backend/access/gin/ginvalidate.c
index 1e3046f4eb7c8..60ce1ae10663b 100644
--- a/src/backend/access/gin/ginvalidate.c
+++ b/src/backend/access/gin/ginvalidate.c
@@ -271,3 +271,68 @@ ginvalidate(Oid opclassoid)
 
 	return result;
 }
+
+/*
+ * Prechecking function for adding operators/functions to a GIN opfamily.
+ */
+void
+ginadjustmembers(Oid opfamilyoid,
+				 Oid opclassoid,
+				 List *operators,
+				 List *functions)
+{
+	ListCell   *lc;
+
+	/*
+	 * Operator members of a GIN opfamily should never have hard dependencies,
+	 * since their connection to the opfamily depends only on what the support
+	 * functions think, and that can be altered.  For consistency, we make all
+	 * soft dependencies point to the opfamily, though a soft dependency on
+	 * the opclass would work as well in the CREATE OPERATOR CLASS case.
+	 */
+	foreach(lc, operators)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		op->ref_is_hard = false;
+		op->ref_is_family = true;
+		op->refobjid = opfamilyoid;
+	}
+
+	/*
+	 * Required support functions should have hard dependencies.  Preferably
+	 * those are just dependencies on the opclass, but if we're in ALTER
+	 * OPERATOR FAMILY, we leave the dependency pointing at the whole
+	 * opfamily.  (Given that GIN opclasses generally don't share opfamilies,
+	 * it seems unlikely to be worth working harder.)
+	 */
+	foreach(lc, functions)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		switch (op->number)
+		{
+			case GIN_EXTRACTVALUE_PROC:
+			case GIN_EXTRACTQUERY_PROC:
+				/* Required support function */
+				op->ref_is_hard = true;
+				break;
+			case GIN_COMPARE_PROC:
+			case GIN_CONSISTENT_PROC:
+			case GIN_COMPARE_PARTIAL_PROC:
+			case GIN_TRICONSISTENT_PROC:
+			case GIN_OPTIONS_PROC:
+				/* Optional, so force it to be a soft family dependency */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+				break;
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						 errmsg("support function number %d is invalid for access method %s",
+								op->number, "gin")));
+				break;
+		}
+	}
+}
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 79fe6eb8d62c9..25b42e38f224b 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -92,6 +92,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = gistproperty;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = gistvalidate;
+	amroutine->amadjustmembers = gistadjustmembers;
 	amroutine->ambeginscan = gistbeginscan;
 	amroutine->amrescan = gistrescan;
 	amroutine->amgettuple = gistgettuple;
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index a285736a81092..2b9ab693be188 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -279,3 +279,72 @@ gistvalidate(Oid opclassoid)
 
 	return result;
 }
+
+/*
+ * Prechecking function for adding operators/functions to a GiST opfamily.
+ */
+void
+gistadjustmembers(Oid opfamilyoid,
+				  Oid opclassoid,
+				  List *operators,
+				  List *functions)
+{
+	ListCell   *lc;
+
+	/*
+	 * Operator members of a GiST opfamily should never have hard
+	 * dependencies, since their connection to the opfamily depends only on
+	 * what the support functions think, and that can be altered.  For
+	 * consistency, we make all soft dependencies point to the opfamily,
+	 * though a soft dependency on the opclass would work as well in the
+	 * CREATE OPERATOR CLASS case.
+	 */
+	foreach(lc, operators)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		op->ref_is_hard = false;
+		op->ref_is_family = true;
+		op->refobjid = opfamilyoid;
+	}
+
+	/*
+	 * Required support functions should have hard dependencies.  Preferably
+	 * those are just dependencies on the opclass, but if we're in ALTER
+	 * OPERATOR FAMILY, we leave the dependency pointing at the whole
+	 * opfamily.  (Given that GiST opclasses generally don't share opfamilies,
+	 * it seems unlikely to be worth working harder.)
+	 */
+	foreach(lc, functions)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		switch (op->number)
+		{
+			case GIST_CONSISTENT_PROC:
+			case GIST_UNION_PROC:
+			case GIST_PENALTY_PROC:
+			case GIST_PICKSPLIT_PROC:
+			case GIST_EQUAL_PROC:
+				/* Required support function */
+				op->ref_is_hard = true;
+				break;
+			case GIST_COMPRESS_PROC:
+			case GIST_DECOMPRESS_PROC:
+			case GIST_DISTANCE_PROC:
+			case GIST_FETCH_PROC:
+			case GIST_OPTIONS_PROC:
+				/* Optional, so force it to be a soft family dependency */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+				break;
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						 errmsg("support function number %d is invalid for access method %s",
+								op->number, "gist")));
+				break;
+		}
+	}
+}
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 3ec6d528e77f7..7c9ccf446c8a4 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -89,6 +89,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = NULL;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = hashvalidate;
+	amroutine->amadjustmembers = hashadjustmembers;
 	amroutine->ambeginscan = hashbeginscan;
 	amroutine->amrescan = hashrescan;
 	amroutine->amgettuple = hashgettuple;
diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c
index 6f14a9fb455d0..0fe97e8276b63 100644
--- a/src/backend/access/hash/hashvalidate.c
+++ b/src/backend/access/hash/hashvalidate.c
@@ -16,6 +16,8 @@
 #include "access/amvalidate.h"
 #include "access/hash.h"
 #include "access/htup_details.h"
+#include "access/xact.h"
+#include "catalog/pg_am.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_amproc.h"
 #include "catalog/pg_opclass.h"
@@ -25,6 +27,7 @@
 #include "parser/parse_coerce.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
 #include "utils/regproc.h"
 #include "utils/syscache.h"
 
@@ -341,3 +344,96 @@ check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype)
 	ReleaseSysCache(tp);
 	return result;
 }
+
+/*
+ * Prechecking function for adding operators/functions to a hash opfamily.
+ */
+void
+hashadjustmembers(Oid opfamilyoid,
+				  Oid opclassoid,
+				  List *operators,
+				  List *functions)
+{
+	Oid			opcintype;
+	ListCell   *lc;
+
+	/*
+	 * Hash operators and required support functions are always "loose"
+	 * members of the opfamily if they are cross-type.  If they are not
+	 * cross-type, we prefer to tie them to the appropriate opclass ... but if
+	 * the user hasn't created one, we can't do that, and must fall back to
+	 * using the opfamily dependency.  (We mustn't force creation of an
+	 * opclass in such a case, as leaving an incomplete opclass laying about
+	 * would be bad.  Throwing an error is another undesirable alternative.)
+	 *
+	 * This behavior results in a bit of a dump/reload hazard, in that the
+	 * order of restoring objects could affect what dependencies we end up
+	 * with.  pg_dump's existing behavior will preserve the dependency choices
+	 * in most cases, but not if a cross-type operator has been bound tightly
+	 * into an opclass.  That's a mistake anyway, so silently "fixing" it
+	 * isn't awful.
+	 *
+	 * Optional support functions are always "loose" family members.
+	 *
+	 * To avoid repeated lookups, we remember the most recently used opclass's
+	 * input type.
+	 */
+	if (OidIsValid(opclassoid))
+	{
+		/* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
+		CommandCounterIncrement();
+		opcintype = get_opclass_input_type(opclassoid);
+	}
+	else
+		opcintype = InvalidOid;
+
+	/*
+	 * We handle operators and support functions almost identically, so rather
+	 * than duplicate this code block, just join the lists.
+	 */
+	foreach(lc, list_concat_copy(operators, functions))
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		if (op->is_func && op->number != HASHSTANDARD_PROC)
+		{
+			/* Optional support proc, so always a soft family dependency */
+			op->ref_is_hard = false;
+			op->ref_is_family = true;
+			op->refobjid = opfamilyoid;
+		}
+		else if (op->lefttype != op->righttype)
+		{
+			/* Cross-type, so always a soft family dependency */
+			op->ref_is_hard = false;
+			op->ref_is_family = true;
+			op->refobjid = opfamilyoid;
+		}
+		else
+		{
+			/* Not cross-type; is there a suitable opclass? */
+			if (op->lefttype != opcintype)
+			{
+				/* Avoid repeating this expensive lookup, even if it fails */
+				opcintype = op->lefttype;
+				opclassoid = opclass_for_family_datatype(HASH_AM_OID,
+														 opfamilyoid,
+														 opcintype);
+			}
+			if (OidIsValid(opclassoid))
+			{
+				/* Hard dependency on opclass */
+				op->ref_is_hard = true;
+				op->ref_is_family = false;
+				op->refobjid = opclassoid;
+			}
+			else
+			{
+				/* We're stuck, so make a soft dependency on the opfamily */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+			}
+		}
+	}
+}
diff --git a/src/backend/access/index/amvalidate.c b/src/backend/access/index/amvalidate.c
index 24d49750adaef..b58c34aa5f2fd 100644
--- a/src/backend/access/index/amvalidate.c
+++ b/src/backend/access/index/amvalidate.c
@@ -1,7 +1,8 @@
 /*-------------------------------------------------------------------------
  *
  * amvalidate.c
- *	  Support routines for index access methods' amvalidate functions.
+ *	  Support routines for index access methods' amvalidate and
+ *	  amadjustmembers functions.
  *
  * Copyright (c) 2016-2020, PostgreSQL Global Development Group
  *
@@ -222,21 +223,28 @@ check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype)
 }
 
 /*
- * Is the datatype a legitimate input type for the btree opfamily?
+ * Get the OID of the opclass belonging to an opfamily and accepting
+ * the specified type as input type.  Returns InvalidOid if no such opclass.
+ *
+ * If there is more than one such opclass, you get a random one of them.
+ * Since that shouldn't happen, we don't waste cycles checking.
+ *
+ * We could look up the AM's OID from the opfamily, but all existing callers
+ * know that or can get it without an extra lookup, so we make them pass it.
  */
-bool
-opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid)
+Oid
+opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid)
 {
-	bool		result = false;
+	Oid			result = InvalidOid;
 	CatCList   *opclist;
 	int			i;
 
 	/*
-	 * We search through all btree opclasses to see if one matches.  This is a
-	 * bit inefficient but there is no better index available.  It also saves
-	 * making an explicit check that the opfamily belongs to btree.
+	 * We search through all the AM's opclasses to see if one matches.  This
+	 * is a bit inefficient but there is no better index available.  It also
+	 * saves making an explicit check that the opfamily belongs to the AM.
 	 */
-	opclist = SearchSysCacheList1(CLAAMNAMENSP, ObjectIdGetDatum(BTREE_AM_OID));
+	opclist = SearchSysCacheList1(CLAAMNAMENSP, ObjectIdGetDatum(amoid));
 
 	for (i = 0; i < opclist->n_members; i++)
 	{
@@ -246,7 +254,7 @@ opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid)
 		if (classform->opcfamily == opfamilyoid &&
 			classform->opcintype == datatypeoid)
 		{
-			result = true;
+			result = classform->oid;
 			break;
 		}
 	}
@@ -255,3 +263,14 @@ opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid)
 
 	return result;
 }
+
+/*
+ * Is the datatype a legitimate input type for the btree opfamily?
+ */
+bool
+opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid)
+{
+	return OidIsValid(opclass_for_family_datatype(BTREE_AM_OID,
+												  opfamilyoid,
+												  datatypeoid));
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index d65f4357cc8bd..49a8a9708e381 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -141,6 +141,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = btproperty;
 	amroutine->ambuildphasename = btbuildphasename;
 	amroutine->amvalidate = btvalidate;
+	amroutine->amadjustmembers = btadjustmembers;
 	amroutine->ambeginscan = btbeginscan;
 	amroutine->amrescan = btrescan;
 	amroutine->amgettuple = btgettuple;
diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c
index 02905f79c8261..5be728ad07cff 100644
--- a/src/backend/access/nbtree/nbtvalidate.c
+++ b/src/backend/access/nbtree/nbtvalidate.c
@@ -16,12 +16,15 @@
 #include "access/amvalidate.h"
 #include "access/htup_details.h"
 #include "access/nbtree.h"
+#include "access/xact.h"
+#include "catalog/pg_am.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_amproc.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_opfamily.h"
 #include "catalog/pg_type.h"
 #include "utils/builtins.h"
+#include "utils/lsyscache.h"
 #include "utils/regproc.h"
 #include "utils/syscache.h"
 
@@ -282,3 +285,96 @@ btvalidate(Oid opclassoid)
 
 	return result;
 }
+
+/*
+ * Prechecking function for adding operators/functions to a btree opfamily.
+ */
+void
+btadjustmembers(Oid opfamilyoid,
+				Oid opclassoid,
+				List *operators,
+				List *functions)
+{
+	Oid			opcintype;
+	ListCell   *lc;
+
+	/*
+	 * Btree operators and comparison support functions are always "loose"
+	 * members of the opfamily if they are cross-type.  If they are not
+	 * cross-type, we prefer to tie them to the appropriate opclass ... but if
+	 * the user hasn't created one, we can't do that, and must fall back to
+	 * using the opfamily dependency.  (We mustn't force creation of an
+	 * opclass in such a case, as leaving an incomplete opclass laying about
+	 * would be bad.  Throwing an error is another undesirable alternative.)
+	 *
+	 * This behavior results in a bit of a dump/reload hazard, in that the
+	 * order of restoring objects could affect what dependencies we end up
+	 * with.  pg_dump's existing behavior will preserve the dependency choices
+	 * in most cases, but not if a cross-type operator has been bound tightly
+	 * into an opclass.  That's a mistake anyway, so silently "fixing" it
+	 * isn't awful.
+	 *
+	 * Optional support functions are always "loose" family members.
+	 *
+	 * To avoid repeated lookups, we remember the most recently used opclass's
+	 * input type.
+	 */
+	if (OidIsValid(opclassoid))
+	{
+		/* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
+		CommandCounterIncrement();
+		opcintype = get_opclass_input_type(opclassoid);
+	}
+	else
+		opcintype = InvalidOid;
+
+	/*
+	 * We handle operators and support functions almost identically, so rather
+	 * than duplicate this code block, just join the lists.
+	 */
+	foreach(lc, list_concat_copy(operators, functions))
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		if (op->is_func && op->number != BTORDER_PROC)
+		{
+			/* Optional support proc, so always a soft family dependency */
+			op->ref_is_hard = false;
+			op->ref_is_family = true;
+			op->refobjid = opfamilyoid;
+		}
+		else if (op->lefttype != op->righttype)
+		{
+			/* Cross-type, so always a soft family dependency */
+			op->ref_is_hard = false;
+			op->ref_is_family = true;
+			op->refobjid = opfamilyoid;
+		}
+		else
+		{
+			/* Not cross-type; is there a suitable opclass? */
+			if (op->lefttype != opcintype)
+			{
+				/* Avoid repeating this expensive lookup, even if it fails */
+				opcintype = op->lefttype;
+				opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
+														 opfamilyoid,
+														 opcintype);
+			}
+			if (OidIsValid(opclassoid))
+			{
+				/* Hard dependency on opclass */
+				op->ref_is_hard = true;
+				op->ref_is_family = false;
+				op->refobjid = opclassoid;
+			}
+			else
+			{
+				/* We're stuck, so make a soft dependency on the opfamily */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+			}
+		}
+	}
+}
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 0efe05e552b63..64d3ba82887bd 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -74,6 +74,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->amproperty = spgproperty;
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = spgvalidate;
+	amroutine->amadjustmembers = spgadjustmembers;
 	amroutine->ambeginscan = spgbeginscan;
 	amroutine->amrescan = spgrescan;
 	amroutine->amgettuple = spggettuple;
diff --git a/src/backend/access/spgist/spgvalidate.c b/src/backend/access/spgist/spgvalidate.c
index f0cfd8b42b1ed..d4f5841e2656b 100644
--- a/src/backend/access/spgist/spgvalidate.c
+++ b/src/backend/access/spgist/spgvalidate.c
@@ -303,3 +303,69 @@ spgvalidate(Oid opclassoid)
 
 	return result;
 }
+
+/*
+ * Prechecking function for adding operators/functions to an SP-GiST opfamily.
+ */
+void
+spgadjustmembers(Oid opfamilyoid,
+				 Oid opclassoid,
+				 List *operators,
+				 List *functions)
+{
+	ListCell   *lc;
+
+	/*
+	 * Operator members of an SP-GiST opfamily should never have hard
+	 * dependencies, since their connection to the opfamily depends only on
+	 * what the support functions think, and that can be altered.  For
+	 * consistency, we make all soft dependencies point to the opfamily,
+	 * though a soft dependency on the opclass would work as well in the
+	 * CREATE OPERATOR CLASS case.
+	 */
+	foreach(lc, operators)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		op->ref_is_hard = false;
+		op->ref_is_family = true;
+		op->refobjid = opfamilyoid;
+	}
+
+	/*
+	 * Required support functions should have hard dependencies.  Preferably
+	 * those are just dependencies on the opclass, but if we're in ALTER
+	 * OPERATOR FAMILY, we leave the dependency pointing at the whole
+	 * opfamily.  (Given that SP-GiST opclasses generally don't share
+	 * opfamilies, it seems unlikely to be worth working harder.)
+	 */
+	foreach(lc, functions)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
+
+		switch (op->number)
+		{
+			case SPGIST_CONFIG_PROC:
+			case SPGIST_CHOOSE_PROC:
+			case SPGIST_PICKSPLIT_PROC:
+			case SPGIST_INNER_CONSISTENT_PROC:
+			case SPGIST_LEAF_CONSISTENT_PROC:
+				/* Required support function */
+				op->ref_is_hard = true;
+				break;
+			case SPGIST_COMPRESS_PROC:
+			case SPGIST_OPTIONS_PROC:
+				/* Optional, so force it to be a soft family dependency */
+				op->ref_is_hard = false;
+				op->ref_is_family = true;
+				op->refobjid = opfamilyoid;
+				break;
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+						 errmsg("support function number %d is invalid for access method %s",
+								op->number, "spgist")));
+				break;
+		}
+	}
+}
diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c
index 351866f9f22a6..28395d5946f3a 100644
--- a/src/backend/commands/opclasscmds.c
+++ b/src/backend/commands/opclasscmds.c
@@ -27,7 +27,6 @@
 #include "catalog/dependency.h"
 #include "catalog/indexing.h"
 #include "catalog/objectaccess.h"
-#include "catalog/opfam_internal.h"
 #include "catalog/pg_am.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_amproc.h"
@@ -62,12 +61,10 @@ static void processTypesSpec(List *args, Oid *lefttype, Oid *righttype);
 static void assignOperTypes(OpFamilyMember *member, Oid amoid, Oid typeoid);
 static void assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid,
 							int opclassOptsProcNum);
-static void addFamilyMember(List **list, OpFamilyMember *member, bool isProc);
-static void storeOperators(List *opfamilyname, Oid amoid,
-						   Oid opfamilyoid, Oid opclassoid,
+static void addFamilyMember(List **list, OpFamilyMember *member);
+static void storeOperators(List *opfamilyname, Oid amoid, Oid opfamilyoid,
 						   List *operators, bool isAdd);
-static void storeProcedures(List *opfamilyname, Oid amoid,
-							Oid opfamilyoid, Oid opclassoid,
+static void storeProcedures(List *opfamilyname, Oid amoid, Oid opfamilyoid,
 							List *procedures, bool isAdd);
 static void dropOperators(List *opfamilyname, Oid amoid, Oid opfamilyoid,
 						  List *operators);
@@ -518,11 +515,12 @@ DefineOpClass(CreateOpClassStmt *stmt)
 
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = false;
 				member->object = operOid;
 				member->number = item->number;
 				member->sortfamily = sortfamilyOid;
 				assignOperTypes(member, amoid, typeoid);
-				addFamilyMember(&operators, member, false);
+				addFamilyMember(&operators, member);
 				break;
 			case OPCLASS_ITEM_FUNCTION:
 				if (item->number <= 0 || item->number > maxProcNumber)
@@ -541,6 +539,7 @@ DefineOpClass(CreateOpClassStmt *stmt)
 #endif
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = true;
 				member->object = funcOid;
 				member->number = item->number;
 
@@ -550,7 +549,7 @@ DefineOpClass(CreateOpClassStmt *stmt)
 									 &member->lefttype, &member->righttype);
 
 				assignProcTypes(member, amoid, typeoid, optsProcNumber);
-				addFamilyMember(&procedures, member, true);
+				addFamilyMember(&procedures, member);
 				break;
 			case OPCLASS_ITEM_STORAGETYPE:
 				if (OidIsValid(storageoid))
@@ -662,14 +661,46 @@ DefineOpClass(CreateOpClassStmt *stmt)
 
 	heap_freetuple(tup);
 
+	/*
+	 * Now that we have the opclass OID, set up default dependency info for
+	 * the pg_amop and pg_amproc entries.  Historically, CREATE OPERATOR CLASS
+	 * has created hard dependencies on the opclass, so that's what we use.
+	 */
+	foreach(l, operators)
+	{
+		OpFamilyMember *op = (OpFamilyMember *) lfirst(l);
+
+		op->ref_is_hard = true;
+		op->ref_is_family = false;
+		op->refobjid = opclassoid;
+	}
+	foreach(l, procedures)
+	{
+		OpFamilyMember *proc = (OpFamilyMember *) lfirst(l);
+
+		proc->ref_is_hard = true;
+		proc->ref_is_family = false;
+		proc->refobjid = opclassoid;
+	}
+
+	/*
+	 * Let the index AM editorialize on the dependency choices.  It could also
+	 * do further validation on the operators and functions, if it likes.
+	 */
+	if (amroutine->amadjustmembers)
+		amroutine->amadjustmembers(opfamilyoid,
+								   opclassoid,
+								   operators,
+								   procedures);
+
 	/*
 	 * Now add tuples to pg_amop and pg_amproc tying in the operators and
 	 * functions.  Dependencies on them are inserted, too.
 	 */
 	storeOperators(stmt->opfamilyname, amoid, opfamilyoid,
-				   opclassoid, operators, false);
+				   operators, false);
 	storeProcedures(stmt->opfamilyname, amoid, opfamilyoid,
-					opclassoid, procedures, false);
+					procedures, false);
 
 	/* let event triggers know what happened */
 	EventTriggerCollectCreateOpClass(stmt, opclassoid, operators, procedures);
@@ -842,6 +873,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 				 int maxOpNumber, int maxProcNumber, int optsProcNumber,
 				 List *items)
 {
+	IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false);
 	List	   *operators;		/* OpFamilyMember list for operators */
 	List	   *procedures;		/* OpFamilyMember list for support procs */
 	ListCell   *l;
@@ -900,11 +932,17 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = false;
 				member->object = operOid;
 				member->number = item->number;
 				member->sortfamily = sortfamilyOid;
+				/* We can set up dependency fields immediately */
+				/* Historically, ALTER ADD has created soft dependencies */
+				member->ref_is_hard = false;
+				member->ref_is_family = true;
+				member->refobjid = opfamilyoid;
 				assignOperTypes(member, amoid, InvalidOid);
-				addFamilyMember(&operators, member, false);
+				addFamilyMember(&operators, member);
 				break;
 			case OPCLASS_ITEM_FUNCTION:
 				if (item->number <= 0 || item->number > maxProcNumber)
@@ -924,8 +962,14 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = true;
 				member->object = funcOid;
 				member->number = item->number;
+				/* We can set up dependency fields immediately */
+				/* Historically, ALTER ADD has created soft dependencies */
+				member->ref_is_hard = false;
+				member->ref_is_family = true;
+				member->refobjid = opfamilyoid;
 
 				/* allow overriding of the function's actual arg types */
 				if (item->class_args)
@@ -933,7 +977,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 									 &member->lefttype, &member->righttype);
 
 				assignProcTypes(member, amoid, InvalidOid, optsProcNumber);
-				addFamilyMember(&procedures, member, true);
+				addFamilyMember(&procedures, member);
 				break;
 			case OPCLASS_ITEM_STORAGETYPE:
 				ereport(ERROR,
@@ -946,14 +990,24 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 		}
 	}
 
+	/*
+	 * Let the index AM editorialize on the dependency choices.  It could also
+	 * do further validation on the operators and functions, if it likes.
+	 */
+	if (amroutine->amadjustmembers)
+		amroutine->amadjustmembers(opfamilyoid,
+								   InvalidOid,	/* no specific opclass */
+								   operators,
+								   procedures);
+
 	/*
 	 * Add tuples to pg_amop and pg_amproc tying in the operators and
 	 * functions.  Dependencies on them are inserted, too.
 	 */
 	storeOperators(stmt->opfamilyname, amoid, opfamilyoid,
-				   InvalidOid, operators, true);
+				   operators, true);
 	storeProcedures(stmt->opfamilyname, amoid, opfamilyoid,
-					InvalidOid, procedures, true);
+					procedures, true);
 
 	/* make information available to event triggers */
 	EventTriggerCollectAlterOpFam(stmt, opfamilyoid,
@@ -996,10 +1050,11 @@ AlterOpFamilyDrop(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 				processTypesSpec(item->class_args, &lefttype, &righttype);
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = false;
 				member->number = item->number;
 				member->lefttype = lefttype;
 				member->righttype = righttype;
-				addFamilyMember(&operators, member, false);
+				addFamilyMember(&operators, member);
 				break;
 			case OPCLASS_ITEM_FUNCTION:
 				if (item->number <= 0 || item->number > maxProcNumber)
@@ -1011,10 +1066,11 @@ AlterOpFamilyDrop(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid,
 				processTypesSpec(item->class_args, &lefttype, &righttype);
 				/* Save the info */
 				member = (OpFamilyMember *) palloc0(sizeof(OpFamilyMember));
+				member->is_func = true;
 				member->number = item->number;
 				member->lefttype = lefttype;
 				member->righttype = righttype;
-				addFamilyMember(&procedures, member, true);
+				addFamilyMember(&procedures, member);
 				break;
 			case OPCLASS_ITEM_STORAGETYPE:
 				/* grammar prevents this from appearing */
@@ -1324,7 +1380,7 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid,
  * duplicated strategy or proc number.
  */
 static void
-addFamilyMember(List **list, OpFamilyMember *member, bool isProc)
+addFamilyMember(List **list, OpFamilyMember *member)
 {
 	ListCell   *l;
 
@@ -1336,7 +1392,7 @@ addFamilyMember(List **list, OpFamilyMember *member, bool isProc)
 			old->lefttype == member->lefttype &&
 			old->righttype == member->righttype)
 		{
-			if (isProc)
+			if (member->is_func)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("function number %d for (%s,%s) appears more than once",
@@ -1358,13 +1414,10 @@ addFamilyMember(List **list, OpFamilyMember *member, bool isProc)
 /*
  * Dump the operators to pg_amop
  *
- * We also make dependency entries in pg_depend for the opfamily entries.
- * If opclassoid is valid then make an INTERNAL dependency on that opclass,
- * else make an AUTO dependency on the opfamily.
+ * We also make dependency entries in pg_depend for the pg_amop entries.
  */
 static void
-storeOperators(List *opfamilyname, Oid amoid,
-			   Oid opfamilyoid, Oid opclassoid,
+storeOperators(List *opfamilyname, Oid amoid, Oid opfamilyoid,
 			   List *operators, bool isAdd)
 {
 	Relation	rel;
@@ -1434,28 +1487,17 @@ storeOperators(List *opfamilyname, Oid amoid,
 		referenced.objectId = op->object;
 		referenced.objectSubId = 0;
 
-		if (OidIsValid(opclassoid))
-		{
-			/* if contained in an opclass, use a NORMAL dep on operator */
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
+		/* see comments in amapi.h about dependency strength */
+		recordDependencyOn(&myself, &referenced,
+						   op->ref_is_hard ? DEPENDENCY_NORMAL : DEPENDENCY_AUTO);
 
-			/* ... and an INTERNAL dep on the opclass */
-			referenced.classId = OperatorClassRelationId;
-			referenced.objectId = opclassoid;
-			referenced.objectSubId = 0;
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL);
-		}
-		else
-		{
-			/* if "loose" in the opfamily, use a AUTO dep on operator */
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
+		referenced.classId = op->ref_is_family ? OperatorFamilyRelationId :
+			OperatorClassRelationId;
+		referenced.objectId = op->refobjid;
+		referenced.objectSubId = 0;
 
-			/* ... and an AUTO dep on the opfamily */
-			referenced.classId = OperatorFamilyRelationId;
-			referenced.objectId = opfamilyoid;
-			referenced.objectSubId = 0;
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
-		}
+		recordDependencyOn(&myself, &referenced,
+						   op->ref_is_hard ? DEPENDENCY_INTERNAL : DEPENDENCY_AUTO);
 
 		/* A search operator also needs a dep on the referenced opfamily */
 		if (OidIsValid(op->sortfamily))
@@ -1463,8 +1505,11 @@ storeOperators(List *opfamilyname, Oid amoid,
 			referenced.classId = OperatorFamilyRelationId;
 			referenced.objectId = op->sortfamily;
 			referenced.objectSubId = 0;
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
+
+			recordDependencyOn(&myself, &referenced,
+							   op->ref_is_hard ? DEPENDENCY_NORMAL : DEPENDENCY_AUTO);
 		}
+
 		/* Post create hook of this access method operator */
 		InvokeObjectPostCreateHook(AccessMethodOperatorRelationId,
 								   entryoid, 0);
@@ -1476,13 +1521,10 @@ storeOperators(List *opfamilyname, Oid amoid,
 /*
  * Dump the procedures (support routines) to pg_amproc
  *
- * We also make dependency entries in pg_depend for the opfamily entries.
- * If opclassoid is valid then make an INTERNAL dependency on that opclass,
- * else make an AUTO dependency on the opfamily.
+ * We also make dependency entries in pg_depend for the pg_amproc entries.
  */
 static void
-storeProcedures(List *opfamilyname, Oid amoid,
-				Oid opfamilyoid, Oid opclassoid,
+storeProcedures(List *opfamilyname, Oid amoid, Oid opfamilyoid,
 				List *procedures, bool isAdd)
 {
 	Relation	rel;
@@ -1546,28 +1588,18 @@ storeProcedures(List *opfamilyname, Oid amoid,
 		referenced.objectId = proc->object;
 		referenced.objectSubId = 0;
 
-		if (OidIsValid(opclassoid))
-		{
-			/* if contained in an opclass, use a NORMAL dep on procedure */
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
+		/* see comments in amapi.h about dependency strength */
+		recordDependencyOn(&myself, &referenced,
+						   proc->ref_is_hard ? DEPENDENCY_NORMAL : DEPENDENCY_AUTO);
 
-			/* ... and an INTERNAL dep on the opclass */
-			referenced.classId = OperatorClassRelationId;
-			referenced.objectId = opclassoid;
-			referenced.objectSubId = 0;
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL);
-		}
-		else
-		{
-			/* if "loose" in the opfamily, use a AUTO dep on procedure */
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
+		referenced.classId = proc->ref_is_family ? OperatorFamilyRelationId :
+			OperatorClassRelationId;
+		referenced.objectId = proc->refobjid;
+		referenced.objectSubId = 0;
+
+		recordDependencyOn(&myself, &referenced,
+						   proc->ref_is_hard ? DEPENDENCY_INTERNAL : DEPENDENCY_AUTO);
 
-			/* ... and an AUTO dep on the opfamily */
-			referenced.classId = OperatorFamilyRelationId;
-			referenced.objectId = opfamilyoid;
-			referenced.objectSubId = 0;
-			recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
-		}
 		/* Post create hook of access method procedure */
 		InvokeObjectPostCreateHook(AccessMethodProcedureRelationId,
 								   entryoid, 0);
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index e116235769b26..ec636620601e2 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -524,6 +524,8 @@
 						 FUNCTION 1 (int4, int4) btint4cmp(int4,int4),
 						 FUNCTION 2 (int4, int4) btint4sortsupport(internal),
 						 FUNCTION 4 (int4, int4) btequalimage(oid);',
+		# note: it's correct that btint8sortsupport and bigint btequalimage
+		# are included here:
 		regexp => qr/^
 			\QALTER OPERATOR FAMILY dump_test.op_family USING btree ADD\E\n\s+
 			\QOPERATOR 1 <(bigint,integer) ,\E\n\s+
@@ -532,7 +534,9 @@
 			\QOPERATOR 4 >=(bigint,integer) ,\E\n\s+
 			\QOPERATOR 5 >(bigint,integer) ,\E\n\s+
 			\QFUNCTION 1 (integer, integer) btint4cmp(integer,integer) ,\E\n\s+
+			\QFUNCTION 2 (bigint, bigint) btint8sortsupport(internal) ,\E\n\s+
 			\QFUNCTION 2 (integer, integer) btint4sortsupport(internal) ,\E\n\s+
+			\QFUNCTION 4 (bigint, bigint) btequalimage(oid) ,\E\n\s+
 			\QFUNCTION 4 (integer, integer) btequalimage(oid);\E
 			/xm,
 		like =>
@@ -1559,6 +1563,8 @@
 						 FUNCTION 1 btint8cmp(bigint,bigint),
 						 FUNCTION 2 btint8sortsupport(internal),
 						 FUNCTION 4 btequalimage(oid);',
+		# note: it's correct that btint8sortsupport and btequalimage
+		# are NOT included here (they're optional support functions):
 		regexp => qr/^
 			\QCREATE OPERATOR CLASS dump_test.op_class\E\n\s+
 			\QFOR TYPE bigint USING btree FAMILY dump_test.op_family AS\E\n\s+
@@ -1567,9 +1573,7 @@
 			\QOPERATOR 3 =(bigint,bigint) ,\E\n\s+
 			\QOPERATOR 4 >=(bigint,bigint) ,\E\n\s+
 			\QOPERATOR 5 >(bigint,bigint) ,\E\n\s+
-			\QFUNCTION 1 (bigint, bigint) btint8cmp(bigint,bigint) ,\E\n\s+
-			\QFUNCTION 2 (bigint, bigint) btint8sortsupport(internal) ,\E\n\s+
-			\QFUNCTION 4 (bigint, bigint) btequalimage(oid);\E
+			\QFUNCTION 1 (bigint, bigint) btint8cmp(bigint,bigint);\E
 			/xm,
 		like =>
 		  { %full_runs, %dump_test_schema_runs, section_pre_data => 1, },
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 4325faa460bdc..85b4766016f89 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -54,6 +54,42 @@ typedef enum IndexAMProperty
 	AMPROP_CAN_INCLUDE
 } IndexAMProperty;
 
+/*
+ * We use lists of this struct type to keep track of both operators and
+ * support functions while building or adding to an opclass or opfamily.
+ * amadjustmembers functions receive lists of these structs, and are allowed
+ * to alter their "ref" fields.
+ *
+ * The "ref" fields define how the pg_amop or pg_amproc entry should depend
+ * on the associated objects (that is, which dependency type to use, and
+ * which opclass or opfamily it should depend on).
+ *
+ * If ref_is_hard is true, the entry will have a NORMAL dependency on the
+ * operator or support func, and an INTERNAL dependency on the opclass or
+ * opfamily.  This forces the opclass or opfamily to be dropped if the
+ * operator or support func is dropped, and requires the CASCADE option
+ * to do so.  Nor will ALTER OPERATOR FAMILY DROP be allowed.  This is
+ * the right behavior for objects that are essential to an opclass.
+ *
+ * If ref_is_hard is false, the entry will have an AUTO dependency on the
+ * operator or support func, and also an AUTO dependency on the opclass or
+ * opfamily.  This allows ALTER OPERATOR FAMILY DROP, and causes that to
+ * happen automatically if the operator or support func is dropped.  This
+ * is the right behavior for inessential ("loose") objects.
+ */
+typedef struct OpFamilyMember
+{
+	bool		is_func;		/* is this an operator, or support func? */
+	Oid			object;			/* operator or support func's OID */
+	int			number;			/* strategy or support func number */
+	Oid			lefttype;		/* lefttype */
+	Oid			righttype;		/* righttype */
+	Oid			sortfamily;		/* ordering operator's sort opfamily, or 0 */
+	bool		ref_is_hard;	/* hard or soft dependency? */
+	bool		ref_is_family;	/* is dependency on opclass or opfamily? */
+	Oid			refobjid;		/* OID of opclass or opfamily */
+} OpFamilyMember;
+
 
 /*
  * Callback function signatures --- see indexam.sgml for more info.
@@ -114,6 +150,12 @@ typedef char *(*ambuildphasename_function) (int64 phasenum);
 /* validate definition of an opclass for this AM */
 typedef bool (*amvalidate_function) (Oid opclassoid);
 
+/* validate operators and support functions to be added to an opclass/family */
+typedef void (*amadjustmembers_function) (Oid opfamilyoid,
+										  Oid opclassoid,
+										  List *operators,
+										  List *functions);
+
 /* prepare for index scan */
 typedef IndexScanDesc (*ambeginscan_function) (Relation indexRelation,
 											   int nkeys,
@@ -224,6 +266,7 @@ typedef struct IndexAmRoutine
 	amproperty_function amproperty; /* can be NULL */
 	ambuildphasename_function ambuildphasename; /* can be NULL */
 	amvalidate_function amvalidate;
+	amadjustmembers_function amadjustmembers;	/* can be NULL */
 	ambeginscan_function ambeginscan;
 	amrescan_function amrescan;
 	amgettuple_function amgettuple; /* can be NULL */
diff --git a/src/include/access/amvalidate.h b/src/include/access/amvalidate.h
index f3a0e52d84ecc..149fc75f85698 100644
--- a/src/include/access/amvalidate.h
+++ b/src/include/access/amvalidate.h
@@ -1,7 +1,8 @@
 /*-------------------------------------------------------------------------
  *
  * amvalidate.h
- *	  Support routines for index access methods' amvalidate functions.
+ *	  Support routines for index access methods' amvalidate and
+ *	  amadjustmembers functions.
  *
  * Copyright (c) 2016-2020, PostgreSQL Global Development Group
  *
@@ -32,6 +33,8 @@ extern bool check_amproc_signature(Oid funcid, Oid restype, bool exact,
 extern bool check_amoptsproc_signature(Oid funcid);
 extern bool check_amop_signature(Oid opno, Oid restype,
 								 Oid lefttype, Oid righttype);
+extern Oid	opclass_for_family_datatype(Oid amoid, Oid opfamilyoid,
+										Oid datatypeoid);
 extern bool opfamily_can_sort_type(Oid opfamilyoid, Oid datatypeoid);
 
 #endif							/* AMVALIDATE_H */
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index 71eeac205c9d9..5cb2f72e4cf80 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -407,6 +407,10 @@ extern ItemPointer ginVacuumItemPointers(GinVacuumState *gvs,
 
 /* ginvalidate.c */
 extern bool ginvalidate(Oid opclassoid);
+extern void ginadjustmembers(Oid opfamilyoid,
+							 Oid opclassoid,
+							 List *operators,
+							 List *functions);
 
 /* ginbulk.c */
 typedef struct GinEntryAccumulator
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 4bfc6280002b8..02e985549f635 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -464,6 +464,10 @@ extern bool gistcanreturn(Relation index, int attno);
 
 /* gistvalidate.c */
 extern bool gistvalidate(Oid opclassoid);
+extern void gistadjustmembers(Oid opfamilyoid,
+							  Oid opclassoid,
+							  List *operators,
+							  List *functions);
 
 /* gistutil.c */
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 7e7b1b73d86d1..bab4d9f1b05cb 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -379,6 +379,10 @@ extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
 												IndexBulkDeleteResult *stats);
 extern bytea *hashoptions(Datum reloptions, bool validate);
 extern bool hashvalidate(Oid opclassoid);
+extern void hashadjustmembers(Oid opfamilyoid,
+							  Oid opclassoid,
+							  List *operators,
+							  List *functions);
 
 /* private routines */
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5274cc750832..65d9698b899ac 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1141,6 +1141,10 @@ extern bool _bt_allequalimage(Relation rel, bool debugmessage);
  * prototypes for functions in nbtvalidate.c
  */
 extern bool btvalidate(Oid opclassoid);
+extern void btadjustmembers(Oid opfamilyoid,
+							Oid opclassoid,
+							List *operators,
+							List *functions);
 
 /*
  * prototypes for functions in nbtsort.c
diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h
index 852d1e2961a79..9f2ccc1730f99 100644
--- a/src/include/access/spgist.h
+++ b/src/include/access/spgist.h
@@ -220,5 +220,9 @@ extern IndexBulkDeleteResult *spgvacuumcleanup(IndexVacuumInfo *info,
 
 /* spgvalidate.c */
 extern bool spgvalidate(Oid opclassoid);
+extern void spgadjustmembers(Oid opfamilyoid,
+							 Oid opclassoid,
+							 List *operators,
+							 List *functions);
 
 #endif							/* SPGIST_H */
diff --git a/src/include/catalog/opfam_internal.h b/src/include/catalog/opfam_internal.h
deleted file mode 100644
index d63bd9ffa3c22..0000000000000
--- a/src/include/catalog/opfam_internal.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * opfam_internal.h
- *
- * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- * src/include/catalog/opfam_internal.h
- *
- *-------------------------------------------------------------------------
- */
-#ifndef OPFAM_INTERNAL_H
-#define OPFAM_INTERNAL_H
-
-/*
- * We use lists of this struct type to keep track of both operators and
- * procedures while building or adding to an opfamily.
- */
-typedef struct
-{
-	Oid			object;			/* operator or support proc's OID */
-	int			number;			/* strategy or support proc number */
-	Oid			lefttype;		/* lefttype */
-	Oid			righttype;		/* righttype */
-	Oid			sortfamily;		/* ordering operator's sort opfamily, or 0 */
-} OpFamilyMember;
-
-#endif							/* OPFAM_INTERNAL_H */

From cd5e82256de5895595cdd99ecb03aea15b346f71 Mon Sep 17 00:00:00 2001
From: Noah Misch <noah@leadboat.com>
Date: Sat, 1 Aug 2020 15:31:01 -0700
Subject: [PATCH 53/54] Change XID and mxact limits to warn at 40M and stop at
 3M.

We have edge-case bugs when assigning values in the last few dozen pages
before the wrap limit.  We may introduce similar bugs in the future.  At
default BLCKSZ, this makes such bugs unreachable outside of single-user
mode.  Also, when VACUUM began to consume mxacts, multiStopLimit did not
change to compensate.

pg_upgrade may fail on a cluster that was already printing "must be
vacuumed" warnings.  Follow the warning's instructions to clear the
warning, then run pg_upgrade again.  One can still, peacefully consume
98% of XIDs or mxacts, so DBAs need not change routine VACUUM settings.

Discussion: https://postgr.es/m/20200621083513.GA3074645@rfd.leadboat.com
---
 doc/src/sgml/maintenance.sgml          |  8 ++++----
 src/backend/access/transam/multixact.c | 24 ++++++++++-------------
 src/backend/access/transam/varsup.c    | 27 ++++++++++++++------------
 3 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml
index 4acdd15d4b379..de0794adeb90b 100644
--- a/doc/src/sgml/maintenance.sgml
+++ b/doc/src/sgml/maintenance.sgml
@@ -608,10 +608,10 @@ SELECT datname, age(datfrozenxid) FROM pg_database;
    <para>
     If for some reason autovacuum fails to clear old XIDs from a table, the
     system will begin to emit warning messages like this when the database's
-    oldest XIDs reach eleven million transactions from the wraparound point:
+    oldest XIDs reach forty million transactions from the wraparound point:
 
 <programlisting>
-WARNING:  database "mydb" must be vacuumed within 10985967 transactions
+WARNING:  database "mydb" must be vacuumed within 39985967 transactions
 HINT:  To avoid a database shutdown, execute a database-wide VACUUM in that database.
 </programlisting>
 
@@ -621,7 +621,7 @@ HINT:  To avoid a database shutdown, execute a database-wide VACUUM in that data
     be able to advance the database's <structfield>datfrozenxid</structfield>.)
     If these warnings are
     ignored, the system will shut down and refuse to start any new
-    transactions once there are fewer than 1 million transactions left
+    transactions once there are fewer than three million transactions left
     until wraparound:
 
 <programlisting>
@@ -629,7 +629,7 @@ ERROR:  database is not accepting commands to avoid wraparound data loss in data
 HINT:  Stop the postmaster and vacuum that database in single-user mode.
 </programlisting>
 
-    The 1-million-transaction safety margin exists to let the
+    The three-million-transaction safety margin exists to let the
     administrator recover without data loss, by manually executing the
     required <command>VACUUM</command> commands.  However, since the system will not
     execute commands once it has gone into the safety shutdown mode,
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index ce84dac0c400c..475f5ed86110a 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -2217,28 +2217,24 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
 		multiWrapLimit += FirstMultiXactId;
 
 	/*
-	 * We'll refuse to continue assigning MultiXactIds once we get within 100
-	 * multi of data loss.
-	 *
-	 * Note: This differs from the magic number used in
-	 * SetTransactionIdLimit() since vacuum itself will never generate new
-	 * multis.  XXX actually it does, if it needs to freeze old multis.
+	 * We'll refuse to continue assigning MultiXactIds once we get within 3M
+	 * multi of data loss.  See SetTransactionIdLimit.
 	 */
-	multiStopLimit = multiWrapLimit - 100;
+	multiStopLimit = multiWrapLimit - 3000000;
 	if (multiStopLimit < FirstMultiXactId)
 		multiStopLimit -= FirstMultiXactId;
 
 	/*
-	 * We'll start complaining loudly when we get within 10M multis of the
-	 * stop point.   This is kind of arbitrary, but if you let your gas gauge
-	 * get down to 1% of full, would you be looking for the next gas station?
-	 * We need to be fairly liberal about this number because there are lots
-	 * of scenarios where most transactions are done by automatic clients that
-	 * won't pay attention to warnings. (No, we're not gonna make this
+	 * We'll start complaining loudly when we get within 40M multis of data
+	 * loss.  This is kind of arbitrary, but if you let your gas gauge get
+	 * down to 2% of full, would you be looking for the next gas station?  We
+	 * need to be fairly liberal about this number because there are lots of
+	 * scenarios where most transactions are done by automatic clients that
+	 * won't pay attention to warnings.  (No, we're not gonna make this
 	 * configurable.  If you know enough to configure it, you know enough to
 	 * not get in this kind of trouble in the first place.)
 	 */
-	multiWarnLimit = multiStopLimit - 10000000;
+	multiWarnLimit = multiWrapLimit - 40000000;
 	if (multiWarnLimit < FirstMultiXactId)
 		multiWarnLimit -= FirstMultiXactId;
 
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index e14b53bf9e352..0142bc70f6a63 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -350,27 +350,30 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
 
 	/*
 	 * We'll refuse to continue assigning XIDs in interactive mode once we get
-	 * within 1M transactions of data loss.  This leaves lots of room for the
+	 * within 3M transactions of data loss.  This leaves lots of room for the
 	 * DBA to fool around fixing things in a standalone backend, while not
 	 * being significant compared to total XID space. (Note that since
 	 * vacuuming requires one transaction per table cleaned, we had better be
-	 * sure there's lots of XIDs left...)
+	 * sure there's lots of XIDs left...)  Also, at default BLCKSZ, this
+	 * leaves two completely-idle segments.  In the event of edge-case bugs
+	 * involving page or segment arithmetic, idle segments render the bugs
+	 * unreachable outside of single-user mode.
 	 */
-	xidStopLimit = xidWrapLimit - 1000000;
+	xidStopLimit = xidWrapLimit - 3000000;
 	if (xidStopLimit < FirstNormalTransactionId)
 		xidStopLimit -= FirstNormalTransactionId;
 
 	/*
-	 * We'll start complaining loudly when we get within 10M transactions of
-	 * the stop point.  This is kind of arbitrary, but if you let your gas
-	 * gauge get down to 1% of full, would you be looking for the next gas
-	 * station?  We need to be fairly liberal about this number because there
-	 * are lots of scenarios where most transactions are done by automatic
-	 * clients that won't pay attention to warnings. (No, we're not gonna make
-	 * this configurable.  If you know enough to configure it, you know enough
-	 * to not get in this kind of trouble in the first place.)
+	 * We'll start complaining loudly when we get within 40M transactions of
+	 * data loss.  This is kind of arbitrary, but if you let your gas gauge
+	 * get down to 2% of full, would you be looking for the next gas station?
+	 * We need to be fairly liberal about this number because there are lots
+	 * of scenarios where most transactions are done by automatic clients that
+	 * won't pay attention to warnings.  (No, we're not gonna make this
+	 * configurable.  If you know enough to configure it, you know enough to
+	 * not get in this kind of trouble in the first place.)
 	 */
-	xidWarnLimit = xidStopLimit - 10000000;
+	xidWarnLimit = xidWrapLimit - 40000000;
 	if (xidWarnLimit < FirstNormalTransactionId)
 		xidWarnLimit -= FirstNormalTransactionId;
 

From 6ee3b5fb990ea11992b0db960d79b1fbe7b5e8e5 Mon Sep 17 00:00:00 2001
From: David Rowley <drowley@postgresql.org>
Date: Sun, 2 Aug 2020 14:24:46 +1200
Subject: [PATCH 54/54] Use int64 instead of long in incremental sort code

Windows 64bit has 4-byte long values which is not suitable for tracking
disk space usage in the incremental sort code. Let's just make all these
fields int64s.

Author: James Coleman
Discussion: https://postgr.es/m/CAApHDvpky%2BUhof8mryPf5i%3D6e6fib2dxHqBrhp0Qhu0NeBhLJw%40mail.gmail.com
Backpatch-through: 13, where the incremental sort code was added
---
 src/backend/commands/explain.c | 20 ++++++++++----------
 src/include/nodes/execnodes.h  |  8 ++++----
 src/include/utils/tuplesort.h  |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 54e3797a15b62..1e565fd337552 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2676,7 +2676,7 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 		TuplesortInstrumentation stats;
 		const char *sortMethod;
 		const char *spaceType;
-		long		spaceUsed;
+		int64		spaceUsed;
 
 		tuplesort_get_stats(state, &stats);
 		sortMethod = tuplesort_method_name(stats.sortMethod);
@@ -2686,7 +2686,7 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 		if (es->format == EXPLAIN_FORMAT_TEXT)
 		{
 			ExplainIndentText(es);
-			appendStringInfo(es->str, "Sort Method: %s  %s: %ldkB\n",
+			appendStringInfo(es->str, "Sort Method: %s  %s: " INT64_FORMAT "kB\n",
 							 sortMethod, spaceType, spaceUsed);
 		}
 		else
@@ -2715,7 +2715,7 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 			TuplesortInstrumentation *sinstrument;
 			const char *sortMethod;
 			const char *spaceType;
-			long		spaceUsed;
+			int64		spaceUsed;
 
 			sinstrument = &sortstate->shared_info->sinstrument[n];
 			if (sinstrument->sortMethod == SORT_TYPE_STILL_IN_PROGRESS)
@@ -2731,7 +2731,7 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 			{
 				ExplainIndentText(es);
 				appendStringInfo(es->str,
-								 "Sort Method: %s  %s: %ldkB\n",
+								 "Sort Method: %s  %s: " INT64_FORMAT "kB\n",
 								 sortMethod, spaceType, spaceUsed);
 			}
 			else
@@ -2795,23 +2795,23 @@ show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
 
 		if (groupInfo->maxMemorySpaceUsed > 0)
 		{
-			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			int64		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
 			const char *spaceTypeName;
 
 			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
-			appendStringInfo(es->str, "  Average %s: %ldkB  Peak %s: %ldkB",
+			appendStringInfo(es->str, "  Average %s: " INT64_FORMAT "kB  Peak %s: " INT64_FORMAT "kB",
 							 spaceTypeName, avgSpace,
 							 spaceTypeName, groupInfo->maxMemorySpaceUsed);
 		}
 
 		if (groupInfo->maxDiskSpaceUsed > 0)
 		{
-			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			int64		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
 
 			const char *spaceTypeName;
 
 			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
-			appendStringInfo(es->str, "  Average %s: %ldkB  Peak %s: %ldkB",
+			appendStringInfo(es->str, "  Average %s: " INT64_FORMAT "kB  Peak %s: " INT64_FORMAT "kB",
 							 spaceTypeName, avgSpace,
 							 spaceTypeName, groupInfo->maxDiskSpaceUsed);
 		}
@@ -2829,7 +2829,7 @@ show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
 
 		if (groupInfo->maxMemorySpaceUsed > 0)
 		{
-			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			int64		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
 			const char *spaceTypeName;
 			StringInfoData memoryName;
 
@@ -2846,7 +2846,7 @@ show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
 		}
 		if (groupInfo->maxDiskSpaceUsed > 0)
 		{
-			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			int64		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
 			const char *spaceTypeName;
 			StringInfoData diskName;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 6f96b31fb4383..cf832d7f90975 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2032,10 +2032,10 @@ typedef struct SortState
 typedef struct IncrementalSortGroupInfo
 {
 	int64		groupCount;
-	long		maxDiskSpaceUsed;
-	long		totalDiskSpaceUsed;
-	long		maxMemorySpaceUsed;
-	long		totalMemorySpaceUsed;
+	int64		maxDiskSpaceUsed;
+	int64		totalDiskSpaceUsed;
+	int64		maxMemorySpaceUsed;
+	int64		totalMemorySpaceUsed;
 	bits32		sortMethods;	/* bitmask of TuplesortMethod */
 } IncrementalSortGroupInfo;
 
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index d992b4875a5e8..9e76666fe9483 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -90,7 +90,7 @@ typedef struct TuplesortInstrumentation
 {
 	TuplesortMethod sortMethod; /* sort algorithm used */
 	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */
-	long		spaceUsed;		/* space consumption, in kB */
+	int64		spaceUsed;		/* space consumption, in kB */
 } TuplesortInstrumentation;