PostgreSQL 16 Practical Cases and Best Practices

Overview

This document collects real-world production experience with PostgreSQL: usage examples, performance tuning, troubleshooting techniques, and development best practices. It covers everything from basic configuration to advanced optimization, helping developers and DBAs quickly master effective ways of working with PostgreSQL.


1. Basic Configuration and Environment Setup

1.1 Production Installation

Environment requirements

  • OS: Linux (Ubuntu 22.04 / CentOS 8)
  • CPU: 8+ cores
  • RAM: 64 GB or more
  • Disk: SSD/NVMe, RAID 10

Building from source

# Install build dependencies
sudo apt-get install -y build-essential libreadline-dev zlib1g-dev \
     flex bison libxml2-dev libxslt-dev libssl-dev libxml2-utils \
     xsltproc libicu-dev pkg-config

# Download the source
wget https://ftp.postgresql.org/pub/source/v16.1/postgresql-16.1.tar.gz
tar -xzvf postgresql-16.1.tar.gz
cd postgresql-16.1

# Configure (enable ICU, OpenSSL, LLVM JIT)
# Note: --enable-debug and --enable-cassert add significant overhead and are
# for development builds only; omit them in production.
./configure --prefix=/usr/local/pgsql-16 \
            --with-openssl \
            --with-icu \
            --with-llvm

# Compile and install
make -j8
sudo make install

# Create the postgres user and data directory
sudo useradd -m -s /bin/bash postgres
sudo mkdir -p /data/pgdata
sudo chown postgres:postgres /data/pgdata

Initializing the database

# Switch to the postgres user
sudo su - postgres

# Initialize the data directory
/usr/local/pgsql-16/bin/initdb -D /data/pgdata \
    --encoding=UTF8 \
    --locale=en_US.UTF-8 \
    --data-checksums \
    --wal-segsize=64

# Start the server
/usr/local/pgsql-16/bin/pg_ctl -D /data/pgdata -l logfile start

# Create a test database
/usr/local/pgsql-16/bin/createdb testdb
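
A quick sanity check from psql confirms the build and that the initdb options took effect:

-- Run inside psql (e.g. /usr/local/pgsql-16/bin/psql testdb)
SELECT version();        -- should report PostgreSQL 16.1
SHOW data_checksums;     -- 'on', because initdb ran with --data-checksums
SHOW wal_segment_size;   -- '64MB', because initdb ran with --wal-segsize=64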

1.2 Core Configuration Parameters

Tuned postgresql.conf

# ===== Connections =====
listen_addresses = '*'                  # listen on all interfaces
port = 5432
max_connections = 200                   # max connections (use a connection pooler)
superuser_reserved_connections = 10

# ===== Memory =====
shared_buffers = 16GB                   # shared buffer pool (~25% of system RAM)
effective_cache_size = 48GB             # planner hint for OS cache size (~75% of RAM)
work_mem = 64MB                         # memory per sort/hash operation
maintenance_work_mem = 2GB              # memory for maintenance (VACUUM, CREATE INDEX)
max_stack_depth = 7MB

# ===== WAL =====
wal_level = replica                     # WAL level (minimal/replica/logical)
fsync = on                              # forced sync (must stay on in production)
synchronous_commit = on                 # synchronous commit (durability vs latency trade-off)
wal_buffers = 16MB                      # WAL buffers
checkpoint_timeout = 15min              # checkpoint interval
max_wal_size = 10GB                     # WAL volume that triggers a checkpoint
min_wal_size = 2GB                      # minimum WAL kept for recycling
checkpoint_completion_target = 0.9      # spread checkpoint I/O across the interval

# ===== Planner costs =====
random_page_cost = 1.1                  # random read cost (SSD: 1.1, HDD: 4.0)
effective_io_concurrency = 200          # concurrent I/O capability (SSD: 200, HDD: 2)
seq_page_cost = 1.0                     # sequential read cost
cpu_tuple_cost = 0.01                   # CPU cost per tuple processed
cpu_index_tuple_cost = 0.005            # CPU cost per index tuple processed
cpu_operator_cost = 0.0025              # CPU cost per operator executed
default_statistics_target = 100         # statistics sampling precision

# ===== Parallel query =====
max_worker_processes = 8                # background worker processes
max_parallel_workers_per_gather = 4     # max workers per Gather node
max_parallel_workers = 8                # global cap on parallel workers
max_parallel_maintenance_workers = 4    # parallelism for maintenance operations
parallel_tuple_cost = 0.1               # cost of shipping a tuple between workers
parallel_setup_cost = 1000.0            # cost of launching parallel workers
min_parallel_table_scan_size = 8MB      # minimum table size for parallel scan
min_parallel_index_scan_size = 512kB    # minimum index size for parallel scan

# ===== Autovacuum =====
autovacuum = on                         # enable autovacuum
autovacuum_max_workers = 3              # max autovacuum workers
autovacuum_naptime = 1min               # time between autovacuum runs
autovacuum_vacuum_threshold = 50        # row-count threshold to trigger VACUUM
autovacuum_vacuum_scale_factor = 0.2    # fraction of table changed to trigger VACUUM (20%)
autovacuum_vacuum_cost_delay = 2ms      # vacuum cost delay
autovacuum_vacuum_cost_limit = 200      # vacuum cost limit

# ===== Logging =====
logging_collector = on                  # enable the log collector
log_directory = 'log'                   # log directory
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_file_mode = 0600
log_rotation_age = 1d                   # rotate log files daily
log_rotation_size = 100MB               # rotate log files at this size
log_min_duration_statement = 1000       # log statements running longer than 1 s
log_checkpoints = on                    # log checkpoints
log_connections = on                    # log connections
log_disconnections = on                 # log disconnections
log_lock_waits = on                     # log lock waits
log_temp_files = 0                      # log every temporary file
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_timezone = 'UTC'

# ===== Statement statistics =====
shared_preload_libraries = 'pg_stat_statements'  # load the statement-stats extension (requires restart)
pg_stat_statements.max = 10000          # keep at most 10000 distinct statements
pg_stat_statements.track = all          # track all statements
pg_stat_statements.track_utility = on   # also track utility/DDL statements
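
After editing postgresql.conf it helps to confirm what actually took effect: pg_settings shows each parameter's source and whether a restart is still pending, and ALTER SYSTEM offers a file-free way to change settings (it writes postgresql.auto.conf):

-- Verify effective values and spot parameters awaiting a restart
SELECT name, setting, unit, source, pending_restart
FROM pg_settings
WHERE name IN ('shared_buffers', 'work_mem', 'shared_preload_libraries');

-- Change a setting without editing the file, then reload
ALTER SYSTEM SET work_mem = '64MB';
SELECT pg_reload_conf();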

Securing pg_hba.conf

# TYPE  DATABASE        USER            ADDRESS                 METHOD

# Local connections (Unix socket)
local   all             all                                     peer

# Local TCP connections
host    all             all             127.0.0.1/32            scram-sha-256
host    all             all             ::1/128                 scram-sha-256

# Remote connections (restricted to known subnets)
host    all             all             192.168.1.0/24          scram-sha-256
host    all             all             10.0.0.0/8              scram-sha-256

# Replication connections
host    replication     replication     192.168.1.0/24          scram-sha-256
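
pg_hba.conf changes take effect on reload, not restart, and PostgreSQL 10+ exposes the parsed rules so mistakes surface before they lock anyone out:

-- Reload, then validate the rules; a non-NULL error column flags a bad line
SELECT pg_reload_conf();
SELECT line_number, type, database, user_name, address, auth_method, error
FROM pg_hba_file_rules;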

2. Performance Tuning in Practice

2.1 Index Optimization

Case 1: single-column indexes vs a composite index

Scenario: query an orders table with a WHERE clause on both user_id and status.

-- Create the test table
CREATE TABLE orders (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    status VARCHAR(20) NOT NULL,
    amount NUMERIC(10,2),
    created_at TIMESTAMP DEFAULT NOW()
);

-- Insert test data (1 million rows)
INSERT INTO orders (user_id, status, amount)
SELECT 
    (random() * 10000)::INTEGER,
    CASE (random() * 3)::INTEGER
        WHEN 0 THEN 'pending'
        WHEN 1 THEN 'paid'
        ELSE 'shipped'
    END,
    (random() * 1000)::NUMERIC(10,2)
FROM generate_series(1, 1000000);
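
After the bulk load, refresh planner statistics so the plans compared below are representative:

VACUUM ANALYZE orders;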

-- Option 1: two single-column indexes
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_status ON orders(status);

-- Query performance
EXPLAIN (ANALYZE, BUFFERS) 
SELECT * FROM orders WHERE user_id = 123 AND status = 'paid';

/*
Plan (option 1):
Bitmap Heap Scan on orders  (cost=25.67..1234.56 rows=50 width=...)
  Recheck Cond: ((user_id = 123) AND ((status)::text = 'paid'::text))
  Buffers: shared hit=150
  -> BitmapAnd  (cost=25.67..25.67 rows=50 width=0)
        Buffers: shared hit=10
        -> Bitmap Index Scan on idx_orders_user_id  (cost=0.00..10.00 rows=100 width=0)
              Index Cond: (user_id = 123)
        -> Bitmap Index Scan on idx_orders_status  (cost=0.00..15.00 rows=333333 width=0)
              Index Cond: ((status)::text = 'paid'::text)
Planning Time: 0.5 ms
Execution Time: 12.3 ms
*/

-- Option 2: a composite index
DROP INDEX idx_orders_user_id;
DROP INDEX idx_orders_status;
CREATE INDEX idx_orders_user_status ON orders(user_id, status);

-- Query performance
EXPLAIN (ANALYZE, BUFFERS) 
SELECT * FROM orders WHERE user_id = 123 AND status = 'paid';

/*
Plan (option 2):
Index Scan using idx_orders_user_status on orders  (cost=0.42..15.67 rows=50 width=...)
  Index Cond: ((user_id = 123) AND ((status)::text = 'paid'::text))
  Buffers: shared hit=5
Planning Time: 0.3 ms
Execution Time: 0.8 ms
*/

Conclusion: the composite index is ~15x faster (12.3 ms → 0.8 ms) and touches 30x fewer buffers (150 → 5).

Best practices

  • Put the more selective column first (user_id is more selective than status)
  • Composite index column order: equality predicates from the WHERE clause first, then ORDER BY columns, then columns needed only by the SELECT list (see the covering-index sketch below)
  • Avoid creating too many indexes (each one adds write overhead)
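
Following up on that ordering rule, a covering index (PostgreSQL 11+ INCLUDE syntax) can turn the example query into an index-only scan when only amount is needed beyond the filter columns; a sketch, with idx_orders_user_status_inc as an assumed name:

-- Key columns drive the search; INCLUDE carries payload columns for index-only scans
CREATE INDEX idx_orders_user_status_inc ON orders (user_id, status) INCLUDE (amount);

EXPLAIN (ANALYZE, BUFFERS)
SELECT user_id, status, amount FROM orders WHERE user_id = 123 AND status = 'paid';
-- expect an Index Only Scan once the visibility map is mostly set (e.g. after VACUUM)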

Case 2: Partial Index

Scenario: only orders with status = 'pending' (about 10% of the table) are queried this way, sorted by recency.

-- Full-table index
CREATE INDEX idx_orders_status_full ON orders(status);
-- Index size: roughly 20MB

-- Partial index covering only pending rows. Indexing created_at supports the
-- ORDER BY; indexing the constant status column itself would add no information.
CREATE INDEX idx_orders_status_pending ON orders(created_at) WHERE status = 'pending';
-- Index size: roughly 2MB

-- Query
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM orders WHERE status = 'pending' ORDER BY created_at DESC LIMIT 10;

/*
Using the partial index (the status = 'pending' predicate is implied by the index):
Limit  (cost=0.29..10.5 rows=10 width=...)
  -> Index Scan Backward using idx_orders_status_pending on orders  (cost=0.29..10.5 rows=10 width=...)
Buffers: shared hit=3
Planning Time: 0.1 ms
Execution Time: 0.3 ms
*/

Advantages

  • ~90% smaller index (20MB → 2MB)
  • Faster queries (fewer index pages to visit)
  • Lower VACUUM overhead
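
The size figures above are illustrative; measure them on your own data:

-- Compare the full and partial index sizes
SELECT relname AS index_name, pg_size_pretty(pg_relation_size(oid)) AS size
FROM pg_class
WHERE relname IN ('idx_orders_status_full', 'idx_orders_status_pending');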

2.2 Query Optimization

Case 3: avoid casts on indexed columns

Note: with an untyped literal such as user_id = '123', PostgreSQL resolves the literal to the column's type and the index still works. The index is lost when the column itself is cast — explicitly, as below, or implicitly when a driver binds a parameter of the wrong type.

-- Bad: the indexed column is cast to text
EXPLAIN ANALYZE
SELECT * FROM orders WHERE user_id::text = '123';  -- user_id is INTEGER

/*
Plan:
Seq Scan on orders  (cost=0.00..25000.00 rows=5000 width=...)
  Filter: ((user_id)::text = '123'::text)  -- the cast on the column disables the index!
  Rows Removed by Filter: 995000
Buffers: shared hit=10000
Planning Time: 0.2 ms
Execution Time: 234.5 ms
*/

-- Good: compare in the column's own type
EXPLAIN ANALYZE
SELECT * FROM orders WHERE user_id = 123;

/*
Plan:
Index Scan using idx_orders_user_id on orders  (cost=0.42..8.5 rows=1 width=...)
  Index Cond: (user_id = 123)
Buffers: shared hit=4
Planning Time: 0.1 ms
Execution Time: 0.2 ms
*/

Conclusion: a type mismatch on the indexed column costs over 1000x in performance here (234.5 ms → 0.2 ms)!

Case 4: CTE optimization (WITH clauses)

Before PostgreSQL 12, CTEs were always materialized and acted as optimization fences. PostgreSQL 12+ inlines CTEs that are referenced once and have no side effects.

-- Inefficient behavior (PostgreSQL < 12, or PostgreSQL 12+ with AS MATERIALIZED)
WITH recent_orders AS (
    SELECT * FROM orders WHERE created_at > NOW() - INTERVAL '7 days'
)
SELECT * FROM recent_orders WHERE user_id = 123;

/*
CTE Scan on recent_orders  (cost=25000.00..25010.00 rows=1 width=...)
  Filter: (user_id = 123)
  -> Materialize  (cost=0.00..25000.00 rows=10000 width=...)
        -> Seq Scan on orders  (cost=0.00..20000.00 rows=10000 width=...)
              Filter: (created_at > (now() - '7 days'::interval))
Planning Time: 0.5 ms
Execution Time: 150.2 ms
*/

-- Efficient behavior (PostgreSQL 12+): the same query, with the CTE inlined automatically
WITH recent_orders AS (
    SELECT * FROM orders WHERE created_at > NOW() - INTERVAL '7 days'
)
SELECT * FROM recent_orders WHERE user_id = 123;

/*
Assuming a composite index on (user_id, created_at):
Index Scan using idx_orders_user_created on orders  (cost=0.42..15.5 rows=1 width=...)
  Index Cond: ((user_id = 123) AND (created_at > (now() - '7 days'::interval)))
Planning Time: 0.3 ms
Execution Time: 0.5 ms
*/

-- Or simply use a plain WHERE clause
SELECT * FROM orders
WHERE created_at > NOW() - INTERVAL '7 days'
  AND user_id = 123;

Conclusion: CTE inlining improves this query by roughly 300x (150 ms → 0.5 ms).
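
PostgreSQL 12+ also lets you pick the behavior per CTE instead of relying on the planner's default:

-- Force the pre-12 behavior (an optimization fence) on PostgreSQL 12+
WITH recent_orders AS MATERIALIZED (
    SELECT * FROM orders WHERE created_at > NOW() - INTERVAL '7 days'
)
SELECT * FROM recent_orders WHERE user_id = 123;

-- Conversely, AS NOT MATERIALIZED forces inlining even where the planner would materialize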

2.3 Partitioned Tables

Case 5: range partitioning by time

Scenario: a log table partitioned by month, queried for the last 7 days of data.

-- Create the partitioned table
CREATE TABLE logs (
    id BIGSERIAL,
    user_id INTEGER,
    action VARCHAR(50),
    created_at TIMESTAMP NOT NULL,
    data JSONB
) PARTITION BY RANGE (created_at);

-- Create partitions (one per month of 2024)
CREATE TABLE logs_2024_01 PARTITION OF logs
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
CREATE TABLE logs_2024_02 PARTITION OF logs
    FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');
-- ... remaining months

-- Create indexes (automatically cascaded to every partition)
CREATE INDEX idx_logs_created_at ON logs(created_at);
CREATE INDEX idx_logs_user_id ON logs(user_id);

-- Query (partition pruning happens automatically)
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM logs WHERE created_at >= '2024-01-15' AND created_at < '2024-01-22';

/*
Append  (cost=0.42..100.5 rows=1000 width=...)
  -> Index Scan using logs_2024_01_created_at_idx on logs_2024_01  (cost=0.42..100.5 rows=1000 width=...)
        Index Cond: ((created_at >= '2024-01-15'::timestamp) AND (created_at < '2024-01-22'::timestamp))
Planning Time: 0.5 ms
Execution Time: 5.2 ms
*/

Advantages

  • Only the relevant partition (logs_2024_01) is scanned; no full-table scan
  • Old partitions can be moved to cheaper storage (e.g. HDD)
  • Historical data is removed by dropping the partition (no VACUUM needed)

Best practices

  • Partition key: a column queries filter on, such as a timestamp or region ID
  • Partition count: keep it under ~100 (too many partitions slows planning)
  • Routine maintenance: create future partitions ahead of time with a script (see the sketch below)
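
A minimal sketch of such a maintenance script, as a SQL DO block run monthly from cron or a scheduler (the partition naming scheme follows the examples above):

DO $$
DECLARE
    next_month date := date_trunc('month', now()) + interval '1 month';
    part_name  text := 'logs_' || to_char(next_month, 'YYYY_MM');
BEGIN
    -- e.g. creates logs_2024_03 covering [2024-03-01, 2024-04-01)
    EXECUTE format(
        'CREATE TABLE IF NOT EXISTS %I PARTITION OF logs FOR VALUES FROM (%L) TO (%L)',
        part_name, next_month, next_month + interval '1 month'
    );
END $$;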

3. Transactions and Concurrency Control

3.1 MVCC and Transaction Isolation Levels

Case 6: non-repeatable reads under Read Committed

-- Session 1
BEGIN;
SELECT * FROM accounts WHERE id = 1;  -- balance = 1000
-- wait for session 2 to commit
SELECT * FROM accounts WHERE id = 1;  -- balance = 900 (non-repeatable read!)
COMMIT;

-- Session 2
BEGIN;
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
COMMIT;

Conclusion: under Read Committed, two reads inside the same transaction can see different data.

Case 7: Repeatable Read prevents phantom reads

-- Session 1 (Repeatable Read)
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SELECT COUNT(*) FROM orders WHERE user_id = 123;  -- result: 10
-- wait for session 2 to commit
SELECT COUNT(*) FROM orders WHERE user_id = 123;  -- result: 10 (no phantom read!)
COMMIT;

-- Session 2
BEGIN;
INSERT INTO orders (user_id, status, amount) VALUES (123, 'paid', 100);
COMMIT;

Conclusion: under Repeatable Read, the transaction works against a snapshot taken at its first query and is unaffected by concurrently committed changes.
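
The flip side is that concurrent writes to the same row no longer silently interleave; the later writer gets a serialization failure that the application must retry:

-- Session 1 (Repeatable Read)
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
-- If another session committed an update to id = 1 after this snapshot was taken:
-- ERROR:  could not serialize access due to concurrent update
-- Catch SQLSTATE 40001 in the application and retry the whole transaction.
ROLLBACK;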

3.2 Deadlock Detection and Prevention

Case 8: a deadlock scenario and its diagnosis

-- Session 1
BEGIN;
UPDATE accounts SET balance = balance - 100 WHERE id = 1;  -- locks id=1
-- then waits for the lock on id=2
UPDATE accounts SET balance = balance + 100 WHERE id = 2;

-- Session 2
BEGIN;
UPDATE accounts SET balance = balance - 50 WHERE id = 2;   -- locks id=2
-- then waits for the lock on id=1 (deadlock!)
UPDATE accounts SET balance = balance + 50 WHERE id = 1;

/*
Error output (session 2):
ERROR:  deadlock detected
DETAIL:  Process 12345 waits for ShareLock on transaction 67890; blocked by process 12346.
Process 12346 waits for ShareLock on transaction 67891; blocked by process 12345.
HINT:  See server log for query details.
CONTEXT:  while updating tuple (0,1) in relation "accounts"
*/

Prevention strategies

  1. Consistent lock ordering: always touch rows in ascending primary-key order. Note that UPDATE has no ORDER BY clause, so impose the order through a row-locking subquery:

    -- Lock id=1 before id=2, then update
    UPDATE accounts SET balance = balance - 100
    WHERE id IN (SELECT id FROM accounts WHERE id IN (1, 2) ORDER BY id FOR UPDATE);

  2. SELECT FOR UPDATE: take the locks explicitly up front

    BEGIN;
    SELECT * FROM accounts WHERE id IN (1, 2) ORDER BY id FOR UPDATE;
    UPDATE accounts SET balance = balance - 100 WHERE id = 1;
    UPDATE accounts SET balance = balance + 100 WHERE id = 2;
    COMMIT;

  3. Tune deadlock_timeout

    SET deadlock_timeout = '100ms';  -- detect deadlocks sooner (more frequent checks)
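
When the set of rows is computed dynamically and consistent row ordering is awkward, transaction-scoped advisory locks taken in key order are another option. A sketch; note PostgreSQL does not strictly guarantee select-list evaluation order, so the ORDER BY subquery below is the customary idiom rather than a hard guarantee:

BEGIN;
-- Application-level locks, acquired in ascending key order, released at COMMIT/ROLLBACK
SELECT pg_advisory_xact_lock(id)
FROM (SELECT id FROM accounts WHERE id IN (1, 2) ORDER BY id) AS t;
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
UPDATE accounts SET balance = balance + 100 WHERE id = 2;
COMMIT;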
    

3.3 Long-Running Transactions

Case 9: why long transactions hurt

-- A long transaction
BEGIN;
SELECT pg_backend_pid();  -- grab the backend PID
-- long idle period (e.g. waiting for user input)
-- ... 10 minutes later ...
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
COMMIT;

Dangers

  • Blocks VACUUM: dead tuples cannot be reclaimed, so tables bloat
  • Holds connections: long-lived sessions exhaust the connection pool
  • Lock waits: other transactions queue behind its locks, hurting concurrency

Diagnosis

-- Find long-running transactions
SELECT pid, now() - xact_start AS duration, state, query
FROM pg_stat_activity
WHERE state != 'idle'
  AND xact_start IS NOT NULL
ORDER BY duration DESC
LIMIT 10;

-- Kill a long-running transaction
SELECT pg_terminate_backend(12345);  -- 12345 is the offending PID

Best practices

  • Keep transactions as short as possible (ideally under 1 second)
  • Never perform slow operations (e.g. external API calls) inside a transaction
  • Configure idle_in_transaction_session_timeout
    SET idle_in_transaction_session_timeout = '5min';
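
Idle-in-transaction sessions are also easy to spot directly:

-- Sessions holding a transaction open while doing nothing
SELECT pid, usename, now() - xact_start AS open_for, state
FROM pg_stat_activity
WHERE state = 'idle in transaction'
ORDER BY open_for DESC;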
    

4. High Availability and Replication

4.1 Streaming Replication

Primary configuration

# postgresql.conf
wal_level = replica
max_wal_senders = 5
wal_keep_size = 1GB  # keep 1GB of WAL (guards against the standby falling too far behind)
archive_mode = on
archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'
# pg_hba.conf
host    replication     replication     192.168.1.0/24    scram-sha-256

Standby configuration

# Initialize the standby with pg_basebackup
pg_basebackup -h 192.168.1.10 -U replication -D /data/pgdata_standby \
              -X stream -P -R

# -X stream: stream WAL during the backup
# -P: show progress
# -R: write standby.signal and append replication settings to postgresql.auto.conf

standby.signal (standby marker file)

# an empty file; its presence makes the instance start as a standby

postgresql.auto.conf (standby settings)

primary_conninfo = 'host=192.168.1.10 port=5432 user=replication password=xxx'
restore_command = 'cp /archive/%f %p'

Starting the standby

pg_ctl -D /data/pgdata_standby start

Verifying replication

On the primary:

-- Check WAL sender status
SELECT * FROM pg_stat_replication;

/*
pid  | usename      | application_name | state     | sync_state | replay_lag
-----|--------------|------------------|-----------|------------|------------
1234 | replication  | walreceiver      | streaming | async      | 00:00:00.1
*/

On the standby:

-- Check replication delay
SELECT now() - pg_last_xact_replay_timestamp() AS replication_lag;

/*
replication_lag
-----------------
00:00:00.234
*/
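
On the primary, lag can also be measured in bytes and broken down by stage (PostgreSQL 10+ columns):

SELECT application_name,
       pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lag_bytes,
       write_lag, flush_lag, replay_lag
FROM pg_stat_replication;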

4.2 Synchronous Replication and Failover

Synchronous replication

On the primary:

synchronous_standby_names = 'standby1'  # name of the synchronous standby
synchronous_commit = on

On the standby:

cluster_name = 'standby1'  # since PG 12, used as the walreceiver's application_name unless primary_conninfo sets one

Failover

  1. Detect that the primary is down

    pg_isready -h 192.168.1.10
    # output: no response

  2. Promote the standby

    # on the standby
    pg_ctl promote -D /data/pgdata_standby

    # or from SQL on the standby (PostgreSQL 12+)
    SELECT pg_promote();

  3. Verify the standby is now a primary

    SELECT pg_is_in_recovery();
    -- output: f (false: no longer in recovery)

  4. Repoint the application: update connection settings to point at the new primary (192.168.1.11)

Automatic failover (Patroni)

Patroni is a high-availability solution built on etcd/Consul/ZooKeeper that provides automatic failure detection and failover.

# patroni.yml
scope: postgres-cluster
name: node1
restapi:
  listen: 0.0.0.0:8008
  connect_address: 192.168.1.10:8008
etcd:
  host: 192.168.1.100:2379
bootstrap:
  dcs:
    ttl: 30
    loop_wait: 10
    retry_timeout: 10
    maximum_lag_on_failover: 1048576
    postgresql:
      use_pg_rewind: true
      parameters:
        wal_level: replica
        hot_standby: on
        max_wal_senders: 5
        synchronous_commit: on

5. Monitoring and Diagnostics

5.1 Key Performance Metrics

Live activity monitoring

-- Currently active sessions
SELECT pid, usename, application_name, state, 
       now() - query_start AS query_duration,
       substring(query, 1, 50) AS query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY query_duration DESC;

-- Lock waits
SELECT blocked_locks.pid AS blocked_pid,
       blocked_activity.usename AS blocked_user,
       blocking_locks.pid AS blocking_pid,
       blocking_activity.usename AS blocking_user,
       blocked_activity.query AS blocked_statement,
       blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks 
    ON blocking_locks.locktype = blocked_locks.locktype
    AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
    AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
    AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
    AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
    AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
    AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
    AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
    AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
    AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;

Slow-query analysis

-- Enable the pg_stat_statements extension
CREATE EXTENSION pg_stat_statements;

-- Top 10 statements by mean execution time
SELECT query, 
       calls,
       total_exec_time / 1000 AS total_time_sec,
       mean_exec_time / 1000 AS mean_time_sec,
       max_exec_time / 1000 AS max_time_sec,
       stddev_exec_time / 1000 AS stddev_time_sec,
       rows
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- Reset the statistics
SELECT pg_stat_statements_reset();

Buffer cache hit ratio

-- Per-table heap hit ratio (should be > 99%)
SELECT 
    schemaname,
    relname,
    heap_blks_read,
    heap_blks_hit,
    CASE WHEN (heap_blks_hit + heap_blks_read) = 0 THEN 0
         ELSE heap_blks_hit * 100.0 / (heap_blks_hit + heap_blks_read)
    END AS hit_ratio
FROM pg_statio_user_tables
ORDER BY heap_blks_read DESC
LIMIT 10;

-- Index buffer hit ratio
SELECT 
    schemaname,
    relname,
    indexrelname,
    idx_blks_read,
    idx_blks_hit,
    CASE WHEN (idx_blks_hit + idx_blks_read) = 0 THEN 0
         ELSE idx_blks_hit * 100.0 / (idx_blks_hit + idx_blks_read)
    END AS hit_ratio
FROM pg_statio_user_indexes
ORDER BY idx_blks_read DESC
LIMIT 10;
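
A cluster-level view per database complements the per-table numbers:

-- Database-wide cache hit ratio
SELECT datname,
       blks_hit, blks_read,
       round(blks_hit * 100.0 / NULLIF(blks_hit + blks_read, 0), 2) AS hit_ratio
FROM pg_stat_database
WHERE datname NOT LIKE 'template%';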

Bloat detection

-- Detect table bloat (dead-tuple ratio)
SELECT 
    schemaname,
    relname,
    n_live_tup AS live_tuples,
    n_dead_tup AS dead_tuples,
    CASE WHEN n_live_tup = 0 THEN 0
         ELSE n_dead_tup * 100.0 / n_live_tup
    END AS dead_ratio,
    last_autovacuum,
    last_autoanalyze
FROM pg_stat_user_tables
WHERE n_dead_tup > 1000
ORDER BY dead_ratio DESC
LIMIT 10;

-- Manual VACUUM
VACUUM (VERBOSE, ANALYZE) orders;
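
For large, update-heavy tables the global 20% scale factor often lets bloat grow too far before autovacuum reacts; per-table storage parameters tighten it (the values here are a starting point, not a prescription):

ALTER TABLE orders SET (
    autovacuum_vacuum_scale_factor = 0.02,  -- vacuum after ~2% of rows are dead
    autovacuum_vacuum_threshold    = 1000
);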

5.2 Log Analysis

Analyzing logs with pgBadger

# Install pgBadger
sudo apt-get install pgbadger

# Analyze a log file
pgbadger /var/log/postgresql/postgresql-2024-01-15.log -o report.html

# Open report.html in a browser.
# It includes: top 10 slow queries, error statistics, session statistics, lock waits, and more.

Live monitoring tools

  1. pg_top: a top-like monitor for PostgreSQL

    pg_top -h localhost -U postgres -d testdb

  2. pgAdmin: graphical administration tool

  3. Grafana + Prometheus + postgres_exporter: monitoring dashboards


6. Backup and Recovery

6.1 Physical Backup (pg_basebackup)

# Full base backup
pg_basebackup -h localhost -U postgres -D /backup/pgdata_$(date +%Y%m%d) \
              -Ft -z -Xs -P

# -Ft: tar format
# -z: gzip compression
# -Xs: stream WAL during the backup
# -P: show progress

6.2 Logical Backup (pg_dump)

# Back up a single database
pg_dump -h localhost -U postgres -Fc -f testdb_$(date +%Y%m%d).dump testdb

# -Fc: custom compressed format (restore with pg_restore, e.g. pg_restore -d testdb <file>.dump)

# Back up all databases
pg_dumpall -h localhost -U postgres -f all_dbs_$(date +%Y%m%d).sql

6.3 Point-in-Time Recovery (PITR)

Configure WAL archiving

# postgresql.conf
wal_level = replica
archive_mode = on
archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'

Recovery steps

  1. Stop the server
  2. Empty the data directory
  3. Restore the base backup
  4. Configure the recovery target
  5. Start the server

# 1. Stop the server
pg_ctl stop -D /data/pgdata

# 2. Empty the data directory
rm -rf /data/pgdata/*

# 3. Restore the base backup (a -Ft -z backup produces base.tar.gz inside the backup directory)
tar -xzf /backup/pgdata_20240115/base.tar.gz -C /data/pgdata

# 4. Configure recovery: recovery.signal must be an EMPTY file; the recovery
#    settings themselves go into postgresql.auto.conf (or postgresql.conf)
touch /data/pgdata/recovery.signal
cat >> /data/pgdata/postgresql.auto.conf << EOF
restore_command = 'cp /archive/%f %p'
recovery_target_time = '2024-01-15 14:30:00'
recovery_target_action = 'promote'
EOF

# 5. Start the server
pg_ctl start -D /data/pgdata

# The server replays archived WAL up to the requested point in time.

7. Security Best Practices

7.1 Users and Privileges

-- Create a read-only user
CREATE ROLE readonly_user LOGIN PASSWORD 'secure_password';

-- Grant connect privilege
GRANT CONNECT ON DATABASE testdb TO readonly_user;

-- Grant SELECT
GRANT USAGE ON SCHEMA public TO readonly_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO readonly_user;

-- Automatically grant SELECT on future tables
ALTER DEFAULT PRIVILEGES IN SCHEMA public 
GRANT SELECT ON TABLES TO readonly_user;

-- Create an application user (read/write)
CREATE ROLE app_user LOGIN PASSWORD 'secure_password';
GRANT CONNECT ON DATABASE testdb TO app_user;
GRANT USAGE, CREATE ON SCHEMA public TO app_user;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO app_user;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO app_user;
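
On PostgreSQL 14 and earlier, every role can create objects in schema public by default; revoking that (along with unneeded PUBLIC database privileges) is a common hardening step. PostgreSQL 15 changed the default so only the database owner can create in public.

REVOKE CREATE ON SCHEMA public FROM PUBLIC;
REVOKE ALL ON DATABASE testdb FROM PUBLIC;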

7.2 SSL Connections

Server configuration

# postgresql.conf
ssl = on
ssl_cert_file = '/etc/postgresql/server.crt'
ssl_key_file = '/etc/postgresql/server.key'
ssl_ca_file = '/etc/postgresql/ca.crt'

Client connection

psql "host=192.168.1.10 dbname=testdb user=postgres sslmode=verify-full"

7.3 Row-Level Security (RLS)

-- Enable RLS
ALTER TABLE orders ENABLE ROW LEVEL SECURITY;

-- Policy: users may only see their own orders
CREATE POLICY orders_user_policy ON orders
FOR SELECT
USING (user_id = current_setting('app.user_id')::INTEGER);

-- The application sets the user ID for the session
SET app.user_id = 123;

-- Queries are filtered automatically
SELECT * FROM orders;  -- returns only rows with user_id = 123
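
Two caveats worth encoding explicitly: the table owner bypasses RLS unless it is forced, and a SELECT-only policy leaves writes unconstrained; a sketch:

-- Apply RLS to the table owner as well
ALTER TABLE orders FORCE ROW LEVEL SECURITY;

-- Constrain writes too, not just reads
CREATE POLICY orders_insert_policy ON orders
FOR INSERT
WITH CHECK (user_id = current_setting('app.user_id')::INTEGER);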

Summary

This document covered the core practices for running PostgreSQL in production:

  • Environment setup and configuration tuning
  • Performance optimization (indexes, queries, partitioning)
  • Transactions and concurrency control
  • High availability and replication
  • Monitoring and diagnostics
  • Backup and recovery
  • Security best practices

With these cases and best practices, developers and DBAs can quickly get productive with PostgreSQL and build stable, high-performance database systems.