test-tablesync-perf-v4.sh
application/octet-stream
Filename: test-tablesync-perf-v4.sh
Type: application/octet-stream
Part: 1
#!/bin/bash
# Configuration
PUB_DIR="/home/ajin/ajin-setup/dataoss"
SUB_DIR="/home/ajin/ajin-setup/dataoss2"
PUB_PORT=6972
SUB_PORT=6973
REPL_USER="repl_user"
REPL_DB="repl_db"
LOG_FILE="tablesync_test.log"
# Helper function to stop and clean PostgreSQL
cleanup_pg() {
local dir=$1
local port=$2
echo "Stopping PostgreSQL in $dir (port $port)..."
pg_ctl -D "$dir" -o "-p $port" stop -m immediate
echo "Cleaning up $dir..."
rm -rf "$dir"
}
# Initialize PostgreSQL cluster
init_pg() {
local dir=$1
echo "Initializing PostgreSQL cluster in $dir..."
initdb -D "$dir" --no-locale --encoding=UTF8
cat << EOF >> "$dir"/postgresql.conf
wal_level = logical
max_wal_senders = 10
max_replication_slots = 10
log_statement = 'all'
log_min_messages = 'debug1'
log_duration = on
log_min_duration_statement = 0
EOF
}
# Start PostgreSQL
start_pg() {
local dir=$1
local port=$2
echo "Starting PostgreSQL in $dir on port $port..."
pg_ctl -D "$dir" -o "-p $port" -l "$dir/logfile" start
sleep 3
}
# Create partitioned table and replication setup
setup_replication() {
echo "Setting up replication..."
# Publisher setup
psql -p $PUB_PORT -d postgres -c "CREATE DATABASE $REPL_DB;"
# Create 10 partitioned tables on publisher
for i in {1..10}; do
psql -p $PUB_PORT -d $REPL_DB <<EOF
CREATE TABLE sales${i} (
id SERIAL NOT NULL,
sale_date DATE NOT NULL,
region TEXT NOT NULL,
amount NUMERIC(10,2) NOT NULL,
category TEXT NOT NULL,
PRIMARY KEY (id, sale_date,region)
) PARTITION BY RANGE (sale_date);
CREATE TABLE sales${i}_2023 PARTITION OF sales${i}
FOR VALUES FROM ('2023-01-01') TO ('2024-01-01')
PARTITION BY HASH (region);
CREATE TABLE sales${i}_2024 PARTITION OF sales${i}
FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')
PARTITION BY HASH (region);
CREATE TABLE sales${i}_2023_part1 PARTITION OF sales${i}_2023
FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE sales${i}_2023_part2 PARTITION OF sales${i}_2023
FOR VALUES WITH (MODULUS 2, REMAINDER 1);
CREATE TABLE sales${i}_2024_part1 PARTITION OF sales${i}_2024
FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE sales${i}_2024_part2 PARTITION OF sales${i}_2024
FOR VALUES WITH (MODULUS 2, REMAINDER 1);
EOF
done
psql -p $PUB_PORT -d $REPL_DB -c "CREATE PUBLICATION pub FOR ALL TABLES WITH (publish_via_partition_root = true);"
echo "Inserting data ..."
# Insert data into all 10 tables
for i in {1..10}; do
psql -p $PUB_PORT -d $REPL_DB -c "INSERT INTO sales${i} (sale_date, region, amount, category)
SELECT
('2023-01-01'::DATE + (random() * 730)::int) AS sale_date,
CASE WHEN random() > 0.5 THEN 'North' ELSE 'South' END AS region,
(random() * 1000)::NUMERIC(10,2) AS amount,
CASE WHEN random() > 0.5 THEN 'Electronics' ELSE 'Furniture' END AS category
FROM generate_series(1, 500);"
done
# Subscriber setup
psql -p $SUB_PORT -d postgres -c "CREATE DATABASE $REPL_DB;"
# Create 10 partitioned tables on subscriber
for i in {1..10}; do
psql -p $SUB_PORT -d $REPL_DB <<EOF
CREATE TABLE sales${i} (
id SERIAL NOT NULL,
sale_date DATE NOT NULL,
region TEXT NOT NULL,
amount NUMERIC(10,2) NOT NULL,
category TEXT NOT NULL,
PRIMARY KEY (id, sale_date,region)
) PARTITION BY RANGE (sale_date);
CREATE TABLE sales${i}_2023 PARTITION OF sales${i}
FOR VALUES FROM ('2023-01-01') TO ('2024-01-01')
PARTITION BY HASH (region);
CREATE TABLE sales${i}_2024 PARTITION OF sales${i}
FOR VALUES FROM ('2024-01-01') TO ('2025-01-01')
PARTITION BY HASH (region);
CREATE TABLE sales${i}_2023_part1 PARTITION OF sales${i}_2023
FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE sales${i}_2023_part2 PARTITION OF sales${i}_2023
FOR VALUES WITH (MODULUS 2, REMAINDER 1);
CREATE TABLE sales${i}_2024_part1 PARTITION OF sales${i}_2024
FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE sales${i}_2024_part2 PARTITION OF sales${i}_2024
FOR VALUES WITH (MODULUS 2, REMAINDER 1);
EOF
done
psql -p $SUB_PORT -d $REPL_DB -c "CREATE SUBSCRIPTION sub CONNECTION 'host=localhost port=$PUB_PORT dbname=$REPL_DB user=$USER' PUBLICATION pub;"
}
# Measure tablesync time
measure_tablesync() {
echo "Measuring tablesync time..."
while true; do
sync_status=$(psql -p $SUB_PORT -d $REPL_DB -t -c "SELECT count(*) FROM pg_subscription_rel WHERE srsubstate != 'r';" | xargs)
if [ "$sync_status" -eq 0 ]; then
break
fi
sleep 1
done
# Grep logs for all 10 tables
for i in {1..10}; do
grep -E -A1 "LOG: statement: COPY public.sales${i}|\(SELECT id, sale_date, region, amount, category FROM public.sales${i}\) TO STDOUT" $PUB_DIR/logfile
done
}
# Main execution
cleanup_pg "$PUB_DIR" "$PUB_PORT"
cleanup_pg "$SUB_DIR" "$SUB_PORT"
init_pg "$PUB_DIR"
init_pg "$SUB_DIR"
start_pg "$PUB_DIR" "$PUB_PORT"
start_pg "$SUB_DIR" "$SUB_PORT"
setup_replication
measure_tablesync
source ~/scripts/log_analyzer_functions.sh
calculate_sync_times $SUB_DIR/logfile