[深度]OpenClaw高可用生产部署:Redis集群 + MySQL主从 + Nginx负载均衡全攻略

阿里云推广

OpenClaw高可用生产环境部署完全指南

开发环境跑起来只需要Docker Compose,但生产环境需要高可用架构。本文给出OpenClaw从单机到高可用的完整部署方案,覆盖Redis集群、MySQL主从复制、Nginx负载均衡三个核心组件。

一、生产环境架构全景

# 高可用架构示意 (最小化生产配置: API层3台服务器, 数据层节点见下方规格建议)
#
# 公网 ──► Nginx(主备) ──► OpenClaw-API x3
#                              │         │
#                    MySQL主库  │  Redis集群(3主3从)
#                    MySQL从库  │
#                              │
#                    Kafka集群(3节点)
#
# 服务器规格建议:
# API节点: 4核8G x3  (无状态,可水平扩展)
# DB节点:  8核32G x2  (主从各一台)
# Redis节点: 4核16G x3 (集群模式)
# Kafka节点: 4核16G x3 (Broker)
#
# 最低成本方案(云服务器):
# 阿里云 4核8G ECS x3 ≈ 1200元/月
# RDS MySQL 8核32G ≈ 800元/月
# Redis企业版 ≈ 300元/月

二、Redis Cluster 部署配置

# Redis Cluster node configuration; the cluster has 6 nodes (3 masters + 3 replicas).
# /etc/redis/redis-7001.conf
# The other nodes use the same settings with their own port and
# cluster-config-file name (7002 ... 7006).
# NOTE(review): this excerpt sets no bind / requirepass / protected-mode, yet
# the cluster below spans two hosts. With Redis' default protected mode,
# password-less remote connections are refused -- confirm the full config
# handles this.

# TCP port this instance listens on.
port 7001
# Run this instance as a Redis Cluster node.
cluster-enabled yes
# Cluster state file maintained by Redis itself (not meant to be hand-edited).
cluster-config-file nodes-7001.conf
# Milliseconds a node may be unreachable before it is considered failing.
cluster-node-timeout 15000
# Enable append-only-file persistence.
appendonly yes
# fsync the AOF once per second.
appendfsync everysec

# Start several instances on the same machine (test layout).
redis-server /etc/redis/redis-7001.conf --daemonize yes
redis-server /etc/redis/redis-7002.conf --daemonize yes
redis-server /etc/redis/redis-7003.conf --daemonize yes

# Start 7004, 7005 and 7006 the same way on the second machine (192.168.1.2),
# matching the addresses passed to "--cluster create" below.

# Create the cluster (--cluster-replicas 1 = one replica per master;
# --cluster-yes accepts the proposed layout without prompting).
redis-cli --cluster create \
    192.168.1.1:7001 192.168.1.1:7002 192.168.1.1:7003 \
    192.168.1.2:7004 192.168.1.2:7005 192.168.1.2:7006 \
    --cluster-replicas 1 --cluster-yes

# Verify the cluster state.
redis-cli -p 7001 cluster info | grep cluster_state
# cluster_state:ok

redis-cli -p 7001 cluster nodes
# Expected: 6 nodes listed, 3 masters and 3 replicas.

三、MySQL主从复制配置

# Primary (source) server configuration: /etc/mysql/conf.d/primary.cnf
# Enables binary logging for the openclaw database so a replica can follow it.
# server-id must be unique per server (the replica uses 2).
[mysqld]
server-id = 1
log-bin = mysql-bin
binlog-format = ROW
binlog-do-db = openclaw
sync_binlog = 1              # sync the binlog to disk at every transaction commit so no committed data is lost
innodb_flush_log_at_trx_commit = 1  # flush the InnoDB redo log at every commit (highest durability)

# On the primary: create the account the replica connects with.
# It is restricted to clients in 192.168.1.% and granted only REPLICATION SLAVE.
# NOTE(review): the password is a placeholder from the article -- replace it
# before real use.
CREATE USER 'repl'@'192.168.1.%' IDENTIFIED BY 'Repl@passwd123';
GRANT REPLICATION SLAVE ON *.* TO 'repl'@'192.168.1.%';
FLUSH PRIVILEGES;

# Read the current binlog coordinates. File and Position below are example
# output; use the values your own primary reports in CHANGE MASTER TO.
SHOW MASTER STATUS;
# +------------------+----------+
# | File             | Position |
# +------------------+----------+
# | mysql-bin.000001 |      154  |

# Replica server configuration: /etc/mysql/conf.d/replica.cnf
# server-id differs from the primary's (1).
[mysqld]
server-id = 2
relay-log = relay-bin
read-only = ON            # replica rejects writes from ordinary clients
super-read-only = ON      # also rejects writes from users with SUPER

# On the replica: point it at the primary and start replication.
# MASTER_LOG_FILE / MASTER_LOG_POS must be the File / Position values that
# SHOW MASTER STATUS returned on the primary.
CHANGE MASTER TO
    MASTER_HOST='192.168.1.1',
    MASTER_USER='repl',
    MASTER_PASSWORD='Repl@passwd123',
    MASTER_LOG_FILE='mysql-bin.000001',
    MASTER_LOG_POS=154;
START SLAVE;

# Check replication health (\G is the mysql client's vertical output format).
SHOW SLAVE STATUS\G
# Slave_IO_Running: Yes
# Slave_SQL_Running: Yes
# Seconds_Behind_Master: 0    <- lag is 0, replication is healthy

四、Nginx负载均衡配置

# /etc/nginx/conf.d/openclaw.conf
#
# TLS-terminating reverse proxy that load-balances ad requests across the
# three OpenClaw API nodes.

upstream openclaw_api {
    least_conn;  # route each request to the node with the fewest active connections
    server 192.168.1.10:8080 weight=3;  # API node 1
    server 192.168.1.11:8080 weight=3;  # API node 2
    server 192.168.1.12:8080 weight=3;  # API node 3
    keepalive 32;  # idle upstream connections kept open per worker, avoiding repeated handshakes
}

server {
    listen 443 ssl http2;
    server_name api.openclaw.yourdomain.com;

    ssl_certificate     /etc/letsencrypt/live/yourdomain.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/yourdomain.com/privkey.pem;

    # Ad-request timeouts (target latency budget: ~100ms).
    # These are per-attempt limits, not a hard end-to-end deadline:
    # connect may take up to 50ms, and the read timeout applies between two
    # successive reads from the upstream, not to the whole response.
    proxy_connect_timeout 50ms;
    proxy_read_timeout    80ms;
    # Do not start a retry on another node once 100ms have already been
    # spent on the request; without this limit a retry could roughly double
    # the worst-case latency.
    proxy_next_upstream_timeout 100ms;

    # Required for upstream keepalive: HTTP/1.1 with an empty Connection header.
    proxy_http_version 1.1;
    proxy_set_header Connection '';

    location /api/v1/ad {
        proxy_pass http://openclaw_api;
        # On connection error, timeout, 502 or 503, retry once on another node
        # (2 attempts in total).
        # NOTE(review): no fallback ad JSON is served here; if every attempt
        # fails the client receives nginx's own 502/504. Add an error_page
        # handler if a fallback response is required.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 2;
    }

    # Monitoring endpoint (basic connection counters), internal network only.
    # NOTE(review): only 10.0.0.0/8 is allowed, while the nodes in this file
    # are in 192.168.1.x -- confirm where the monitoring system runs.
    location /nginx_status {
        stub_status;
        allow 10.0.0.0/8;
        deny all;
    }
}

五、健康检查与自动化运维

#!/bin/bash
# healthcheck.sh - run once per minute (e.g. from cron).
#
# Probes the OpenClaw API nodes, the MySQL replica's replication lag and the
# local Redis Cluster node. Prints one [ALERT]/[WARN] line to stdout for each
# failed check and prints nothing when everything is healthy.

readonly API_HOSTS=(192.168.1.10 192.168.1.11 192.168.1.12)
readonly REPLICA_HOST=192.168.1.2
readonly MAX_LAG_SECONDS=5

# --- API nodes ---------------------------------------------------------------
# Timeouts keep one hung node from stalling the whole run past the next
# scheduled invocation.
for host in "${API_HOSTS[@]}"; do
    if ! curl -sf --connect-timeout 2 --max-time 5 "http://${host}:8080/health" \
            | grep -q '"status":"ok"'; then
        echo "[ALERT] API节点 $host 异常!"
        # No "nginx -s reload" here: reloading an unchanged config does not
        # remove a node from the upstream. nginx's passive health checks
        # (max_fails=1 / fail_timeout=10s by default) already stop routing
        # to a node that fails requests.
    fi
done

# --- MySQL replication lag ---------------------------------------------------
# NOTE(review): the password is passed on the command line as in the original;
# prefer a client option file (e.g. ~/.my.cnf) so it is not visible in `ps`.
lag=$(mysql --connect-timeout=5 -h "$REPLICA_HOST" -u monitor -pmonitor123 -e \
    'SHOW SLAVE STATUS\G' 2>/dev/null | awk '/Seconds_Behind_Master/ {print $2}')

# Seconds_Behind_Master is NULL when replication is not running, and the value
# is empty when the query itself fails. Both are the most serious cases and
# must alert instead of tripping a numeric comparison error.
if [[ ! "$lag" =~ ^[0-9]+$ ]]; then
    echo "[ALERT] MySQL主从复制异常: Seconds_Behind_Master=${lag:-无法获取}"
elif [ "$lag" -gt "$MAX_LAG_SECONDS" ]; then
    echo "[WARN] MySQL主从延迟: ${lag}秒"
fi

# --- Redis Cluster -----------------------------------------------------------
if ! redis-cli -p 7001 cluster info | grep -q 'cluster_state:ok'; then
    echo '[ALERT] Redis集群状态异常'
fi

总结:OpenClaw高可用的三个支柱——Redis Cluster保障高并发频控不挂,MySQL主从保障数据持久性,Nginx多实例保障接入层不停服。生产环境务必配置健康检查和告警,故障要在用户感知之前就被发现。

发表评论