breaking_utf8.py
text/x-python
Filename: breaking_utf8.py
Type: text/x-python
Part: 0
import psycopg
# Connection string passed to psycopg.connect(): database "badutf8", port 5435.
DB_CONN_STRING = "dbname=badutf8 port=5435"
# NOTE(review): this connection and cursor are opened as soon as the module is
# loaded, but the functions visible in this file each open their own
# connection and never read `con` or the module-level `cur`; nothing visible
# here closes them either. Confirm nothing else relies on these names before
# removing them.
con = psycopg.connect(DB_CONN_STRING)
cur = con.cursor()
def setup_database() -> None:
    """Create the ``bademail`` table if it does not already exist.

    Opens a dedicated connection from ``DB_CONN_STRING`` for the duration of
    the call. The table has a serial ``id`` primary key and four TEXT
    columns: ``subject``, ``sender``, ``date`` and ``body``.
    """
    with psycopg.connect(DB_CONN_STRING) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS bademail (
                id SERIAL PRIMARY KEY,
                subject TEXT,
                sender TEXT,
                date TEXT,
                body TEXT
            );
        """)
        # Commit explicitly so the DDL is persisted before the block exits.
        conn.commit()
# Parameterized INSERT; the four placeholders line up with the four entries
# of `params` below (subject, sender, date, body).
query = "INSERT INTO bademail (subject, sender, date, body) VALUES (%s, %s, %s, %s)"

# Sample row for the INSERT above.
#
# The body is a plain ``str`` literal, so every ``\xNN`` escape in it is one
# code point in the U+0080..U+00FF range, not one byte. For example
# ``\xe2\x80\x99`` is the three characters U+00E2 U+0080 U+0099, not the
# single character U+2019 that those three bytes spell in UTF-8.
# NOTE(review): presumably intentional, since this text is the payload the
# script exists to insert -- confirm before "fixing" the escapes.
#
# The body literal must stay on ONE physical line: a line break inside a
# single-quoted string literal is a syntax error.
params = [
    'Re: [PATCH] Add RetrieveInstrumentation hook for CustomScan providers',
    'Siddharth Kothari <sidkot@google.com>',
    'Wed, 08 Apr 2026 10:57:16 +0530',
    "Hi everyone,\n\nI\xe2\x80\x99m just checking in to see if anyone has had a chance to look at this or\nif there\xe2\x80\x99s any further information I should provide to help with the\nreview. I have also added the patch to PG20-1 CF queue, the link is\nhttps://commitfest.postgresql.org/patch/6524/.\n\nThanks,\nSiddharth\n\nOn Wed, Feb 18, 2026 at 3:09\xe2\x80\xafPM Siddharth Kothari <sidkot@google.com> wrote:\n\n> Dear PostgreSQL Hackers,\n>\n> This email proposes a patch to enhance the CustomScan provider interface.\n> The patch file,\n> 0001-Add-RetrieveInstrumentationCustomScan-hook-for-Custo.patch, is\n> attached.\n>\n> *Problem:*\n>\n> CustomScan providers currently lack a standard method to aggregate\n> instrumentation data from parallel workers back to the leader process\n> before the Dynamic Shared Memory (DSM) segment is unlinked. This makes it\n> difficult to gather comprehensive performance metrics from parallelized\n> custom scans.\n>\n> *Solution:*\n>\n> This patch introduces a new optional hook,\n> RetrieveInstrumentationCustomScan, to the CustomExecMethods struct. This\n> hook allows custom scan providers to implement logic to collect and\n> consolidate instrumentation from shared memory or worker states during the\n> parallel query cleanup phase. This hook is invoked via the new\n> ExecCustomScanRetrieveInstrumentation function, called from\n> ExecParallelRetrieveInstrumentation for T_CustomScanState nodes. Since\n> the hook is optional (checked for NULL before calling), it maintains full\n> backward compatibility.\n>\n> *Testing & Compatibility:*\n>\n> - The patch compiles and passes all core regression tests (make\n> check-world) on my x86_64 instance.\n> - The changes are not platform-specific.\n> - Regression Tests: This patch provides a new *capability* for custom\n> scan providers. Since the hook\'s functionality is only realized when\n> implemented by an extension, specific tests would naturally reside within\n> that extension rather than in the core regression suite.\n>\n> This patch does not directly address a specific item on the official TODO\n> list but enhances the extensibility framework.\n>\n> I believe this patch is complete and ready for review. I look forward to\n> any feedback and am happy to make revisions. I will also add this patch to\n> the next CommitFest.\n>\n> Thank you,\n>\n> Siddharth Kothari\n>)",
]
def insert_bad_data() -> None:
    """Insert the sample e-mail row into ``bademail``.

    Executes the module-level ``query`` with the module-level ``params`` on a
    dedicated connection opened from ``DB_CONN_STRING``, then commits.
    """
    with psycopg.connect(DB_CONN_STRING) as conn:
        with conn.cursor() as cur:
            # Values are passed separately from the SQL text (parameterized).
            cur.execute(query, params)
        conn.commit()
if __name__ == "__main__":
    # Create the table first so the INSERT has a target.
    setup_database()
    insert_bad_data()
"""
badutf8=# select ctid, id, substring(body from '^.{4}') from bademail;
ctid │ id │ substring
───────┼────┼───────────
(0,1) │ 1 │ Hi e
(1 row)
badutf8=# select ctid, id, substring(body, 1, 4) from bademail;
ERROR: 22021: invalid byte sequence for encoding "UTF8": 0xc2
LOCATION: report_invalid_encoding_int, mbutils.c:1847
badutf8=# select ctid, id, substring(normalize(body), 1, 4) from bademail;
ctid │ id │ substring
───────┼────┼───────────
(0,1) │ 1 │ Hi e
(1 row)
badutf8=# select ctid, id, length(body), length(normalize(body)), body=normalize(body) from bademail;
ctid │ id │ length │ length │ ?column?
───────┼────┼────────┼────────┼──────────
(0,1) │ 1 │ 2314 │ 2314 │ t
(1 row)
"""