00:00:00
架构概览
┌─────────────┐     ┌──────────────┐     ┌─────────────┐
│   Targets   │────▶│  Prometheus  │────▶│   Grafana   │
│  (Exporters)│     │  (存储+查询) │     │   (可视化)  │
└─────────────┘     └──────────────┘     └─────────────┘
                           │
                           ▼
                    ┌──────────────┐
                    │ Alertmanager │
                    │    (告警)    │
                    └──────────────┘
1. Prometheus 安装部署
1.1 Docker 方式部署
bash
# 创建配置文件目录
mkdir -p /opt/prometheus/{config,data}
# 创建 prometheus.yml 配置文件
cat > /opt/prometheus/config/prometheus.yml <<EOF
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'node_exporter'
static_configs:
- targets: ['node-exporter:9100']
EOF
# 启动 Prometheus
docker run -d \
--name prometheus \
-p 9090:9090 \
-v /opt/prometheus/config:/etc/prometheus \
-v /opt/prometheus/data:/prometheus \
prom/prometheus:latest \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus \
--web.enable-lifecycle
1.2 Docker Compose 方式(推荐)
yaml
# docker-compose.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
container_name: prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-lifecycle'
restart: unless-stopped
node-exporter:
image: prom/node-exporter:latest
container_name: node-exporter
ports:
- "9100:9100"
restart: unless-stopped
grafana:
image: grafana/grafana:latest
container_name: grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- grafana_data:/var/lib/grafana
restart: unless-stopped
volumes:
prometheus_data:
grafana_data:
bash
# 启动服务
docker-compose up -d
# 查看状态
docker-compose ps
2. 常用 Exporter 配置
2.1 Node Exporter(主机监控)
bash
# 启动 Node Exporter
docker run -d \
--name node-exporter \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
prom/node-exporter:latest \
--path.rootfs=/host
Prometheus 配置:
yaml
scrape_configs:
- job_name: 'node'
static_configs:
- targets: ['192.168.1.10:9100', '192.168.1.11:9100']
labels:
env: 'production'
2.2 MySQL Exporter(数据库监控)
bash
# 创建数据库监控用户(以下两条为 SQL 语句,需在 MySQL 客户端中执行,而非 shell)
CREATE USER 'exporter'@'%' IDENTIFIED BY 'password';
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
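# 启动 MySQL Exporter(注意:mysqld-exporter 自 0.15.0 起已移除 DATA_SOURCE_NAME 环境变量;使用 latest 镜像时需改用 --mysqld.address、--mysqld.username 参数和 MYSQLD_EXPORTER_PASSWORD 环境变量,或固定使用 0.14.x 版本镜像)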
docker run -d \
--name mysqld-exporter \
-p 9104:9104 \
-e DATA_SOURCE_NAME="exporter:password@(mysql-host:3306)/" \
prom/mysqld-exporter:latest
2.3 Redis Exporter
bash
docker run -d \
--name redis-exporter \
-p 9121:9121 \
oliver006/redis_exporter:latest \
--redis.addr=redis://redis-host:6379
2.4 Blackbox Exporter(黑盒监控)
yaml
# blackbox.yml
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
valid_status_codes: [200]
method: GET
bash
docker run -d \
--name blackbox-exporter \
-p 9115:9115 \
-v /path/to/blackbox.yml:/etc/blackbox_exporter/config.yml \
prom/blackbox-exporter:latest
Prometheus 配置:
yaml
scrape_configs:
- job_name: 'blackbox'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- https://example.com
- https://api.example.com
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
3. Grafana 配置
3.1 添加 Prometheus 数据源
- 访问 Grafana: http://localhost:3000
- 默认账号: admin/admin
- Configuration → Data Sources → Add data source
- 选择 Prometheus
- 填写 URL: http://prometheus:9090
- 点击 Save & Test
3.2 导入常用 Dashboard
推荐 Dashboard ID:
| Dashboard | ID | 用途 |
|---|---|---|
| Node Exporter Full | 1860 | Linux 主机监控 |
| MySQL Overview | 7362 | MySQL 数据库监控 |
| Redis Dashboard | 11835 | Redis 监控 |
| Docker and System Monitoring | 893 | Docker 容器监控 |
| Blackbox Exporter | 7587 | HTTP 探测监控 |
导入步骤:
1. Dashboards → Import
2. 输入 Dashboard ID (如 1860)
3. 选择 Prometheus 数据源
4. 点击 Import
4. 告警规则配置
4.1 创建告警规则文件
yaml
# /opt/prometheus/config/rules.yml
groups:
- name: node_alerts
interval: 30s
rules:
# 主机 CPU 使用率超过 80%
- alert: HighCpuUsage
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "主机 CPU 使用率过高"
description: "{{ $labels.instance }} CPU 使用率为 {{ $value }}%"
# 主机内存使用率超过 85%
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "主机内存使用率过高"
description: "{{ $labels.instance }} 内存使用率为 {{ $value }}%"
# 磁盘使用率超过 85%
- alert: HighDiskUsage
expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "磁盘使用率过高"
description: "{{ $labels.instance }} 磁盘 {{ $labels.mountpoint }} 使用率为 {{ $value }}%"
# 主机宕机
- alert: HostDown
expr: up{job="node"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "主机宕机"
description: "{{ $labels.instance }} 已宕机超过 1 分钟"
- name: service_alerts
interval: 30s
rules:
# HTTP 服务不可用
- alert: ServiceDown
expr: probe_success{job="blackbox"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "服务不可用"
description: "{{ $labels.instance }} 探测失败"
# HTTP 响应时间过长
- alert: SlowResponse
expr: probe_http_duration_seconds > 3
for: 5m
labels:
severity: warning
annotations:
summary: "HTTP 响应时间过长"
description: "{{ $labels.instance }} 响应时间为 {{ $value }}s"
4.2 更新 Prometheus 配置
yaml
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "/etc/prometheus/rules.yml"
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
scrape_configs:
# ... 之前的配置
4.3 重载配置
bash
# 方式1:通过 API 重载
curl -X POST http://localhost:9090/-/reload
# 方式2:重启容器
docker restart prometheus
5. Alertmanager 配置
5.1 基础配置
yaml
# alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
receivers:
- name: 'default'
webhook_configs:
- url: 'http://webhook-server:8080/alert'
5.2 企业微信告警
yaml
receivers:
- name: 'wechat'
wechat_configs:
- corp_id: 'your_corp_id'
to_party: '1'
agent_id: 'your_agent_id'
api_secret: 'your_secret'
send_resolved: true
5.3 钉钉告警
yaml
receivers:
- name: 'dingtalk'
webhook_configs:
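# 注意:Alertmanager 的 webhook 载荷格式与钉钉机器人接口并不兼容,直接填写机器人地址通常无法发送成功,一般需要通过 prometheus-webhook-dingtalk 等转换服务中转,并将此处 url 指向该服务
- url: 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN'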
send_resolved: true
6. 常用 PromQL 查询
promql
# CPU 使用率
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# 内存使用率
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# 磁盘使用率
(node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100
# 网络流量(入站)
irate(node_network_receive_bytes_total[5m])
# 网络流量(出站)
irate(node_network_transmit_bytes_total[5m])
# HTTP 请求速率
rate(http_requests_total[5m])
# 服务可用性(最近 1 小时)
avg_over_time(up[1h]) * 100
7. 性能优化建议
7.1 Prometheus 优化
yaml
# 数据保留时间
--storage.tsdb.retention.time=15d
# 数据保留大小
--storage.tsdb.retention.size=50GB
# 减少采集频率(非关键指标)
scrape_interval: 60s
7.2 Grafana 优化
ini
# /etc/grafana/grafana.ini
[database]
type = mysql
host = mysql:3306
name = grafana
user = grafana
password = password
[server]
enable_gzip = true
[caching]
enabled = true
8. 备份与恢复
bash
# 备份 Prometheus 数据
tar -czf prometheus_backup_$(date +%Y%m%d).tar.gz /opt/prometheus/data/
# 备份 Grafana 数据
# 注意:下面这条命令的作用是重置 Grafana 管理员密码,并不是备份步骤,仅在忘记密码时才需要执行
docker exec grafana grafana-cli admin reset-admin-password newpassword
docker cp grafana:/var/lib/grafana grafana_backup_$(date +%Y%m%d)
# 恢复(grafana_backup 为示例路径,请替换为备份时实际生成的目录,如 grafana_backup_20240101,并确认其中的目录层级)
docker cp grafana_backup/grafana grafana:/var/lib/
docker restart grafana