Matarialize中文社区-索引设计与优化策略

索引设计与优化策略

落叶 2026-05-02 20:38:17

4 索引,PostgreSQL 专栏

一、索引基础原理

1. 索引的本质

索引是数据库中用于加速数据检索的数据结构，类似于书籍的目录。PostgreSQL 提供了多种索引类型，每种都有其特定的使用场景。

-- 查看表是否有索引
SELECT 
    tablename,
    indexname,
    indexdef
FROM pg_indexes 
WHERE tablename = 'users';

-- 查看索引使用情况统计
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan AS times_used,
    idx_tup_read AS tuples_read,
    idx_tup_fetch AS tuples_fetched
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;

2. 索引的代价

-- 索引的代价体现在三个方面：
-- 1. 存储空间：索引通常占用表大小的 10%-50%
-- 2. 写入性能：INSERT/UPDATE/DELETE 需要同步更新索引
-- 3. 维护成本：VACUUM 和 ANALYZE 需要处理索引

-- 查看索引占用空间
SELECT 
    tablename,
    indexname,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS index_size
FROM pg_indexes
WHERE tablename = 'large_table'
ORDER BY pg_relation_size(indexname::regclass) DESC;

二、索引类型详解

1. B-Tree 索引（默认）

B-Tree 是最通用的索引类型，适用于等值查询和范围查询。

-- 创建 B-Tree 索引
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_users_age ON users(age);

-- 适用的查询模式
-- 1. 等值查询
SELECT * FROM users WHERE email = 'user@example.com';

-- 2. 范围查询
SELECT * FROM users WHERE age BETWEEN 20 AND 30;
SELECT * FROM users WHERE age > 18;
SELECT * FROM users WHERE created_at > '2024-01-01';

-- 3. 排序查询（避免单独排序）
SELECT * FROM users ORDER BY email LIMIT 10;

-- 4. 前缀模糊匹配（仅限左匹配）
SELECT * FROM users WHERE email LIKE 'user@%';

-- B-Tree 索引与排序方向
CREATE INDEX idx_users_age_email ON users(age ASC, email DESC);

2. Hash 索引

Hash 索引仅支持等值查询，但理论上比 B-Tree 更快。

-- 创建 Hash 索引（PostgreSQL 10+ 已支持 WAL 日志）
CREATE INDEX idx_users_phone_hash ON users USING hash(phone);

-- 适用场景：非常明确的等值查询
SELECT * FROM users WHERE phone = '13800138000';

-- Hash 索引限制
-- ❌ 不支持范围查询
-- ❌ 不支持排序
-- ❌ 不支持部分索引
-- ❌ 不支持多列索引

-- 查看 Hash 索引是否被使用
EXPLAIN SELECT * FROM users WHERE phone = '13800138000';

3. GiST 索引

GiST（Generalized Search Tree）是一种平衡树结构，适用于复杂数据类型。

-- 创建 GiST 索引
CREATE EXTENSION IF NOT EXISTS btree_gist;

-- 1. 范围类型索引
CREATE INDEX idx_reservations_time ON reservations USING gist(time_range);

-- 查询重叠的时间段
SELECT * FROM reservations 
WHERE time_range && tstzrange('2024-06-01 09:00', '2024-06-01 17:00');

-- 2. 几何类型索引
CREATE INDEX idx_locations_point ON locations USING gist(geo_point);

-- 查询附近的地点
SELECT * FROM locations
WHERE geo_point <-> point(40.7128, -74.0060) < 10000;  -- 10公里内

-- 3. 全文搜索（通常用 GIN，GiST 也支持）
CREATE INDEX idx_articles_search_gist ON articles USING gist(search_vector);

-- 4. 数组重叠查询（需要 btree_gist 扩展）
CREATE INDEX idx_tags_gist ON articles USING gist(tags);

SELECT * FROM articles WHERE tags && ARRAY['postgresql', 'database'];

4. GIN 索引

GIN（Generalized Inverted Index）是倒排索引，适用于包含多个值的复合数据类型。

-- GIN 索引适用场景
-- 1. JSONB 查询
CREATE INDEX idx_products_data ON products USING gin(product_data);

SELECT * FROM products WHERE product_data @> '{"category": "Electronics"}';
SELECT * FROM products WHERE product_data ? 'warranty';

-- 2. 数组查询
CREATE INDEX idx_articles_tags ON articles USING gin(tags);

SELECT * FROM articles WHERE tags @> ARRAY['postgresql', 'database'];
SELECT * FROM articles WHERE tags && ARRAY['postgresql'];

-- 3. 全文搜索
CREATE INDEX idx_articles_search ON articles USING gin(search_vector);

SELECT * FROM articles 
WHERE search_vector @@ to_tsquery('english', 'postgresql & performance');

-- 4. 文本相似度（pg_trgm）
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX idx_users_name_trgm ON users USING gin(name gin_trgm_ops);

SELECT * FROM users WHERE name % 'Jonh';  -- 模糊匹配

-- GIN 索引优化参数
CREATE INDEX idx_gin_optimized ON large_table USING gin(data)
WITH (fastupdate = on, gin_pending_list_limit = 4096);

5. BRIN 索引

BRIN（Block Range Index）是一种块范围索引，适用于超大表中自然排序的列。

-- BRIN 索引特点：极小、但可能扫描更多块
-- 适用条件：数据物理顺序与逻辑顺序一致

-- 创建 BRIN 索引
CREATE INDEX idx_logs_created_at ON logs USING brin(created_at);

-- 每页块数量（pages_per_range）
CREATE INDEX idx_brin_custom ON huge_table USING brin(id)
WITH (pages_per_range = 128);  -- 默认 128

-- 查看 BRIN 索引摘要
SELECT 
    brinrelid::regclass AS table_name,
    brinindexid::regclass AS index_name,
    brinallvisible
FROM pg_brin_index_stats;

-- BRIN 最佳实践
-- ✅ 适用：时序数据（日志、监控）、自增 ID
-- ❌ 不适用：随机分布的数据、频繁更新的列

-- 维护 BRIN 索引
VACUUM huge_table;  -- 更新 BRIN 摘要信息

6. SP-GiST 索引

SP-GiST（Space-Partitioned GiST）适用于空间分区的数据。

-- SP-GiST 适用场景：点、线、圆等空间数据
CREATE INDEX idx_points_spgist ON points USING spgist(geom);

-- 相比 GiST，SP-GiST 在某些空间查询中性能更好
-- 适用于四叉树、k-d 树等空间分区算法

7. 部分索引

部分索引只索引表的一部分行，极大地减少了索引大小。

-- 只索引活跃用户
CREATE INDEX idx_active_users_email ON users(email) 
WHERE status = 'active';

-- 查询会自动使用部分索引
SELECT * FROM users WHERE status = 'active' AND email = 'alice@example.com';

-- 部分索引的好处
-- 1. 索引更小
-- 2. 维护成本更低
-- 3. 查询更快

-- 多条件部分索引
CREATE INDEX idx_high_value_orders ON orders(created_at)
WHERE total_amount > 10000;

-- 排除 NULL 值的部分索引
CREATE INDEX idx_users_phone_notnull ON users(phone)
WHERE phone IS NOT NULL;

8. 表达式索引

表达式索引基于函数或表达式创建，而非直接基于列。

-- 基于函数的索引
CREATE INDEX idx_users_lower_email ON users(LOWER(email));

-- 查询时使用相同的表达式
SELECT * FROM users WHERE LOWER(email) = 'alice@example.com';

-- 基于 JSONB 字段的索引
CREATE INDEX idx_products_price ON products(((product_data->>'price')::NUMERIC));

SELECT * FROM products 
WHERE (product_data->>'price')::NUMERIC > 100;

-- 基于日期函数的索引
CREATE INDEX idx_orders_year_month ON orders(EXTRACT(YEAR FROM created_at), EXTRACT(MONTH FROM created_at));

SELECT * FROM orders 
WHERE EXTRACT(YEAR FROM created_at) = 2024 
  AND EXTRACT(MONTH FROM created_at) = 6;

-- 多个表达式的复合索引
CREATE INDEX idx_users_name_age ON users(LOWER(name), age);

9. 覆盖索引（INCLUDE）

覆盖索引在索引中额外存储非搜索列，实现索引只读查询。

-- 覆盖索引语法（PostgreSQL 11+）
CREATE INDEX idx_users_covering ON users(id) INCLUDE (name, email, status);

-- 查询可以完全不访问表
SELECT id, name, email, status FROM users WHERE id = 1;
-- 所有数据都从索引获取

-- 查看是否为覆盖索引
EXPLAIN (ANALYZE, BUFFERS) 
SELECT id, name, email FROM users WHERE id = 1;
-- 应该显示 "Heap Fetches: 0"

-- 覆盖索引的最佳实践
-- ✅ 常用的大表查询
-- ✅ 高并发只读查询
-- ❌ 频繁更新的列
-- ❌ 太大的列（TEXT、JSONB、BYTEA）

三、复合索引设计

1. 列顺序原则

-- 复合索引的列顺序至关重要
CREATE INDEX idx_composite ON orders(user_id, status, created_at);

-- 列顺序决策原则：
-- 1. 等值条件列优先
-- 2. 范围条件列次之
-- 3. 排序列最后

-- ✅ 能使用的查询
SELECT * FROM orders WHERE user_id = 1;                           -- 使用第一列
SELECT * FROM orders WHERE user_id = 1 AND status = 'active';      -- 使用前两列
SELECT * FROM orders WHERE user_id = 1 AND status = 'active' 
  AND created_at > '2024-01-01';                                   -- 使用全部

-- ❌ 不能使用的查询
SELECT * FROM orders WHERE status = 'active';                      -- 不使用索引
SELECT * FROM orders WHERE created_at > '2024-01-01';              -- 不使用索引

2. 选择性与索引效率

-- 选择性 = 不同值数量 / 总行数
-- 选择性越高（接近 1），索引效率越好

-- 查看列的选择性
SELECT 
    COUNT(DISTINCT email) AS distinct_count,
    COUNT(*) AS total_count,
    ROUND(COUNT(DISTINCT email) * 100.0 / COUNT(*), 2) AS selectivity_pct
FROM users;

-- 低选择性列不合适单独作为索引
-- 示例：status 列只有几个值（active, pending, deleted）
-- 选择性很低，B-Tree 索引效果差

-- 低选择性列的替代方案
-- 1. 部分索引
CREATE INDEX idx_active_users ON users(id) WHERE status = 'active';

-- 2. 复合索引（放在后面）
CREATE INDEX idx_users_status_created ON users(status, created_at);

-- 3. BRIN 索引（如果数据有序）
CREATE INDEX idx_users_status_brin ON users USING brin(status);

3. 复合索引设计模式

-- 模式1：等值+范围
CREATE INDEX idx_user_created ON orders(user_id, created_at);
-- 适用于：特定用户的时间范围查询

-- 模式2：多等值+排序
CREATE INDEX idx_user_status_date ON orders(user_id, status, created_at DESC);
-- 适用于：特定用户的特定状态，按时间倒序

-- 模式3：覆盖索引
CREATE INDEX idx_order_covering ON orders(user_id, status, created_at) 
INCLUDE (total_amount, order_number);
-- 适用于：高频查询，避免回表

-- 模式4：部分复合索引
CREATE INDEX idx_active_user_orders ON orders(user_id, created_at)
WHERE status != 'cancelled';

四、索引使用分析

1. 检查索引是否生效

-- 使用 EXPLAIN 分析索引使用情况
EXPLAIN (ANALYZE, BUFFERS, VERBOSE)
SELECT * FROM users WHERE email = 'alice@example.com';

-- 关键输出解读：
-- Index Scan: 使用了索引
-- Seq Scan: 全表扫描（可能需要优化）
-- Bitmap Index Scan + Bitmap Heap Scan: 组合索引扫描
-- Index Only Scan: 覆盖索引，无需回表

-- 查看索引扫描效率
SELECT 
    relname AS table_name,
    indexrelname AS index_name,
    idx_scan AS index_scans,
    idx_tup_read AS tuples_read,
    idx_tup_fetch AS tuples_fetched
FROM pg_stat_user_indexes
WHERE idx_scan > 0
ORDER BY idx_tup_fetch / NULLIF(idx_scan, 0) DESC;

2. 未使用索引检测

-- 查找从未使用的索引
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexname::regclass) DESC;

-- 查找重复索引
SELECT 
    pg_get_indexdef(indexrelid) AS index_definition,
    array_agg(indexrelid::regclass) AS duplicate_indexes
FROM pg_index
WHERE indrelid = 'users'::regclass
GROUP BY pg_get_indexdef(indexrelid)
HAVING COUNT(*) > 1;

-- 查找相似索引（部分重合）
SELECT 
    a.indexrelid::regclass AS index1,
    b.indexrelid::regclass AS index2,
    a.indkey AS columns1,
    b.indkey AS columns2
FROM pg_index a, pg_index b
WHERE a.indrelid = b.indrelid 
  AND a.indexrelid < b.indexrelid
  AND a.indkey @> b.indkey;

3. 索引命中率分析

-- 计算索引命中率
SELECT 
    schemaname,
    tablename,
    idx_scan,
    seq_scan,
    ROUND(100.0 * idx_scan / NULLIF(idx_scan + seq_scan, 0), 2) AS index_usage_pct
FROM pg_stat_user_tables
WHERE idx_scan + seq_scan > 0
ORDER BY index_usage_pct;

-- 缓存命中率（与索引相关）
SELECT 
    'index hit rate' AS name,
    SUM(idx_blks_hit) * 100.0 / NULLIF(SUM(idx_blks_hit) + SUM(idx_blks_read), 0) AS hit_ratio
FROM pg_statio_user_indexes;

五、索引维护与优化

1. 索引维护操作

-- 重建索引（消除膨胀）
REINDEX INDEX idx_users_email;
REINDEX TABLE users;  -- 重建表上所有索引
REINDEX DATABASE mydb;  -- 重建整个数据库的索引

-- 在线重建（不阻塞写入）
REINDEX INDEX CONCURRENTLY idx_users_email;

-- 分析索引统计信息
ANALYZE VERBOSE users;

-- 重置索引统计
SELECT pg_stat_reset();
SELECT pg_stat_reset_single_table_counters('users'::regclass);

-- 设置索引的填充因子（减少页分裂）
CREATE INDEX idx_fillfactor ON large_table(id) WITH (fillfactor = 90);
ALTER INDEX idx_fillfactor SET (fillfactor = 85);

2. 索引膨胀监控

-- 检测索引膨胀
CREATE EXTENSION IF NOT EXISTS pgstattuple;

SELECT 
    indexrelid::regclass AS index_name,
    pg_size_pretty(pg_relation_size(indexrelid)) AS size,
    version,
    leaf_fragmentation AS fragmentation_pct,
    dead_tuple_count AS dead_tuples
FROM pgstatindex('idx_users_email');

-- 查找需要重建的索引（膨胀率 > 20%）
SELECT 
    schemaname,
    tablename,
    indexname,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS size,
    idx_scan,
    idx_tup_read,
    idx_tup_fetch
FROM pg_stat_user_indexes
WHERE pg_relation_size(indexname::regclass) > 10 * 1024 * 1024  -- > 10MB
  AND idx_scan < 100  -- 使用较少
ORDER BY pg_relation_size(indexname::regclass) DESC;

3. 自动索引推荐

-- PostgreSQL 自身不提供自动索引推荐，但可以手动分析

-- 1. 分析慢查询中的过滤条件
-- 查看 pg_stat_statements 中最耗时的查询
SELECT 
    query,
    calls,
    mean_exec_time,
    stddev_exec_time,
    rows
FROM pg_stat_statements
WHERE query NOT LIKE '%pg_%'
ORDER BY mean_exec_time DESC
LIMIT 20;

-- 2. 分析缺少索引的表
SELECT 
    schemaname,
    tablename,
    seq_scan,
    seq_tup_read,
    idx_scan,
    ROUND(seq_tup_read / NULLIF(seq_scan, 0)) AS avg_rows_per_seq_scan
FROM pg_stat_user_tables
WHERE seq_scan > 100
ORDER BY seq_scan DESC;

-- 3. 分析 WHERE 条件中的列
-- 需要开启 track_activities 和 track_counts
SELECT 
    schemaname,
    tablename,
    attname,
    n_distinct,
    correlation
FROM pg_stats
WHERE tablename = 'orders'
ORDER BY correlation;

六、特殊场景索引策略

1. 分页查询优化

-- 传统分页（OFFSET 效率低）
SELECT * FROM users ORDER BY id LIMIT 10 OFFSET 100000;
-- OFFSET 会扫描前 100010 行

-- 优化方案1：游标分页（Seek Method）
SELECT * FROM users 
WHERE id > 100000 
ORDER BY id LIMIT 10;

-- 优化方案2：延迟关联
SELECT u.* 
FROM users u
INNER JOIN (
    SELECT id FROM users 
    ORDER BY id 
    LIMIT 10 OFFSET 100000
) AS sub ON u.id = sub.id;

-- 需要对应的索引
CREATE INDEX idx_users_id ON users(id);

-- 时间戳分页
SELECT * FROM events 
WHERE created_at < '2024-01-01 00:00:00' 
ORDER BY created_at DESC LIMIT 20;
-- 需要索引：CREATE INDEX idx_events_created ON events(created_at);

2. 排序查询优化

-- 避免文件排序（filesort）
-- 创建与 ORDER BY 匹配的索引

-- ✅ 索引顺序与排序一致
CREATE INDEX idx_created_at ON orders(created_at);
SELECT * FROM orders ORDER BY created_at;

-- ✅ 多列排序
CREATE INDEX idx_user_date ON orders(user_id, created_at DESC);
SELECT * FROM orders WHERE user_id = 1 ORDER BY created_at DESC;

-- ❌ 无法使用索引的排序
CREATE INDEX idx_date ON orders(created_at);
SELECT * FROM orders ORDER BY created_at DESC;  -- 可以使用反向扫描
SELECT * FROM orders ORDER BY created_at NULLS LAST;  -- 需要匹配 NULLS 选项

-- 表达式排序
CREATE INDEX idx_lower_name ON users(LOWER(name));
SELECT * FROM users ORDER BY LOWER(name);

3. 全文搜索索引优化

-- 全文搜索索引参数调优
CREATE INDEX idx_articles_search ON articles USING gin(search_vector)
WITH (fastupdate = off, gin_pending_list_limit = 8192);

-- 字典优化
CREATE TEXT SEARCH CONFIGURATION custom_english (COPY = english);
ALTER TEXT SEARCH CONFIGURATION custom_english
    ALTER MAPPING FOR word, asciiword WITH english_stem, thesaurus_english;

-- 权重优化
CREATE INDEX idx_weighted_search ON articles USING gin(
    setweight(to_tsvector('english', title), 'A') ||
    setweight(to_tsvector('english', content), 'B')
);

-- tsvector 预计算（生成列）
ALTER TABLE articles ADD COLUMN search_vector tsvector
GENERATED ALWAYS AS (
    setweight(to_tsvector('english', title), 'A') ||
    setweight(to_tsvector('english', content), 'B')
) STORED;

4. JSONB 索引优化

-- JSONB 索引策略
-- 1. 默认 GIN 索引
CREATE INDEX idx_jsonb_gin ON products USING gin(product_data);

-- 2. 指定操作符的 GIN 索引（更小更快）
CREATE INDEX idx_jsonb_gin_ops ON products USING gin(product_data jsonb_path_ops);

-- 3. 针对特定路径的索引
CREATE INDEX idx_product_category ON products((product_data->>'category'));
CREATE INDEX idx_product_price ON products(((product_data->>'price')::NUMERIC));

-- 4. 表达式索引
CREATE INDEX idx_product_category_price ON products(
    (product_data->>'category'),
    ((product_data->>'price')::NUMERIC)
);

-- JSONB 索引使用示例
EXPLAIN SELECT * FROM products 
WHERE product_data @> '{"category": "Electronics", "price": 500}';

-- 使用 jsonb_path_ops 的查询
SELECT * FROM products 
WHERE product_data @@ '$.category == "Electronics"';

七、索引设计最佳实践

1. 索引设计检查清单

-- ✅ 应该创建索引的场景
-- 1. WHERE 条件中频繁使用的列
-- 2. JOIN 的关联列（外键）
-- 3. ORDER BY 和 GROUP BY 的列
-- 4. 唯一性约束的列
-- 5. 覆盖索引用于高频查询

-- ❌ 不建议创建索引的场景
-- 1. 小表（< 1000 行）
-- 2. 频繁更新的列
-- 3. 低选择性的列（如 gender、status）
-- 4. 很少使用的查询
-- 5. BLOB/TEXT 长字段（除非使用全文搜索）

-- 索引创建决策矩阵
SELECT 
    '是否需要索引' AS question,
    CASE 
        WHEN seq_scan > 1000 AND seq_tup_read / seq_scan > 100 THEN '是'
        WHEN idx_scan > 0 AND idx_tup_fetch / idx_scan < 10 THEN '可能不需要'
        ELSE '分析具体查询'
    END AS suggestion
FROM pg_stat_user_tables
WHERE tablename = 'your_table';

2. 索引命名规范

-- 推荐命名规范
-- 普通索引：idx_表名_列名
CREATE INDEX idx_users_email ON users(email);

-- 唯一索引：uq_表名_列名
CREATE UNIQUE INDEX uq_users_email ON users(email);

-- 复合索引：idx_表名_列1_列2
CREATE INDEX idx_orders_user_status ON orders(user_id, status);

-- 表达式索引：idx_表名_表达式
CREATE INDEX idx_users_lower_email ON users(LOWER(email));

-- 部分索引：idx_表名_列名_条件
CREATE INDEX idx_active_users_email ON users(email) WHERE status = 'active';

3. 索引数量控制

-- 每个表的索引数量建议
-- 小表（< 10万行）：最多 3-5 个索引
-- 中表（10万-1000万行）：最多 5-8 个索引
-- 大表（>1000万行）：最多 8-12 个索引

-- 查看表的索引数量
SELECT 
    tablename,
    COUNT(*) AS index_count,
    SUM(pg_relation_size(indexname::regclass)) AS total_index_size
FROM pg_indexes
GROUP BY tablename
ORDER BY index_count DESC;

-- 索引维护成本评估
SELECT 
    idx_scan AS reads,
    pg_relation_size(indexrelid) AS size,
    idx_tup_fetch AS rows_fetched,
    idx_scan + idx_tup_fetch AS total_work
FROM pg_stat_user_indexes;

4. 生产环境索引变更流程

-- 安全的索引变更流程（零停机时间）

-- 步骤1：创建索引（CONCURRENTLY 不阻塞写入）
CREATE INDEX CONCURRENTLY idx_new_column ON large_table(new_column);

-- 步骤2：验证索引
SELECT * FROM pg_indexes WHERE indexname = 'idx_new_column';

-- 步骤3：测试查询性能
EXPLAIN (ANALYZE, BUFFERS) 
SELECT * FROM large_table WHERE new_column = 'value';

-- 步骤4：（可选）删除旧索引
DROP INDEX CONCURRENTLY IF EXISTS idx_old_column;

-- 批量索引创建
DO $$
DECLARE
    idx_name TEXT;
BEGIN
    FOR idx_name IN 
        SELECT indexname FROM pg_indexes 
        WHERE tablename = 'large_table' AND indexname NOT LIKE '%_concurrently'
    LOOP
        EXECUTE format('DROP INDEX CONCURRENTLY %I', idx_name);
        EXECUTE format('CREATE INDEX CONCURRENTLY %I ON large_table USING btree(new_column)', idx_name);
    END LOOP;
END;
$$ LANGUAGE plpgsql;

评论区