Files
Linux-Shell/Prometheus/advanced_prometheus_config.sh
2026-04-06 21:07:22 +08:00

155 lines
5.3 KiB
Bash

#!/bin/bash
#
# Advanced Prometheus Configuration Script
#
# Writes a Prometheus main configuration file and a set of alerting rules
# below /etc/prometheus, and points alert delivery at an Alertmanager target.
set -e

# Locations of everything this script creates or overwrites.
CONFIG_DIR="/etc/prometheus"
MAIN_CONFIG="${CONFIG_DIR}/prometheus.yml"
RULES_DIR="${CONFIG_DIR}/rules"
DATA_DIR="${CONFIG_DIR}/prometheus_data"

# Create the rules and data directories (a no-op when they already exist).
for required_dir in "$RULES_DIR" "$DATA_DIR"; do
  sudo mkdir -p "$required_dir"
done

echo "Configuring Advanced Prometheus Features..."
# 1. Create a Sample Alert Rule File
#
# The here-document delimiter is quoted ('EOF') so the shell copies the body
# verbatim: Prometheus template variables such as $labels and $value need no
# backslash escaping. The \" sequences are YAML escapes for double quotes
# inside double-quoted strings and must reach the file unchanged.
# Indentation inside the body is significant: it is what nests every alert
# under the "rules:" list of its group.
echo "Creating default alert rules in $RULES_DIR/node_alerts.yml..."
sudo tee "$RULES_DIR/node_alerts.yml" > /dev/null <<'EOF'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
EOF
# 2. Create the Advanced Main Configuration File
#
# An existing prometheus.yml is copied aside (timestamped, attributes
# preserved) before being overwritten, so the previous setup can be restored.
# The here-document delimiter is quoted so the YAML is written verbatim, and
# the indentation inside the body is what gives the YAML its structure.
echo "Creating advanced prometheus.yml..."
if sudo test -f "$MAIN_CONFIG"; then
  config_backup="${MAIN_CONFIG}.$(date +%Y%m%d%H%M%S).bak"
  sudo cp -p "$MAIN_CONFIG" "$config_backup"
  echo "Existing configuration backed up to $config_backup"
fi
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

  # Example of a job with many targets (placeholder)
  # - job_name: 'external_nodes'
  #   static_configs:
  #     - targets:
  #         - '192.168.1.100:9100'
  #         - '192.168.1.101:9100'

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"
EOF
# 3. Validate the configuration, then apply it by restarting Prometheus
#
# Restarting with an invalid configuration would leave Prometheus down, so
# when promtool is installed the main config (and the rule files it
# references) is checked first and the restart is skipped on failure.
# promtool is resolved to a full path before being handed to sudo because
# sudo may run with a different PATH than the calling shell.
promtool_bin="$(command -v promtool || true)"
if [[ -n "$promtool_bin" ]]; then
  if ! sudo "$promtool_bin" check config "$MAIN_CONFIG"; then
    echo "Error: $MAIN_CONFIG failed validation; Prometheus was not restarted." >&2
    exit 1
  fi
else
  echo "Warning: promtool not found; skipping configuration validation." >&2
fi

# Use the dedicated restart helper when it is installed and executable;
# otherwise restart the systemd unit directly.
if [[ -x "/usr/bin/restart_prometheus" ]]; then
  sudo /usr/bin/restart_prometheus
else
  sudo systemctl restart prometheus
fi

echo "-------------------------------------------------------"
echo "Advanced Prometheus configuration applied successfully!"
echo "Rules directory: $RULES_DIR"
echo "Alerting rules loaded from: node_alerts.yml"
echo "Alertmanager target set to: localhost:9093"
echo "-------------------------------------------------------"
echo "Note: If you haven't installed Alertmanager yet, you will see"
echo "errors in the Prometheus logs about connecting to 9093."