添加Prometheus高级配置脚本

This commit is contained in:
CN-JS-HuiBai
2026-04-06 21:07:22 +08:00
parent bc78a1f601
commit a13ccc7bdc
2 changed files with 275 additions and 0 deletions

View File

@@ -0,0 +1,154 @@
#!/bin/bash
# Advanced Prometheus Configuration Script
#
# Sets up a Prometheus configuration under /etc/prometheus: alerting rules,
# a rule_files entry, and Alertmanager integration. Uses sudo for every
# filesystem change.
set -e

# Filesystem layout; the later steps derive their paths from these.
CONFIG_DIR="/etc/prometheus"
RULES_DIR="${CONFIG_DIR}/rules"
DATA_DIR="${CONFIG_DIR}/prometheus_data"
MAIN_CONFIG="${CONFIG_DIR}/prometheus.yml"

# Make sure the rules and data directories exist (mkdir -p is a no-op when
# they are already present).
for dir in "$RULES_DIR" "$DATA_DIR"; do
  sudo mkdir -p "$dir"
done

echo "Configuring Advanced Prometheus Features..."
# 1. Create a Sample Alert Rule File
#
# Writes $RULES_DIR/node_alerts.yml (as root, via tee). The heredoc delimiter
# is quoted so the shell performs no expansion inside it: the Go-template
# placeholders ({{ $labels.* }}, {{ $value }}) reach the file verbatim without
# per-occurrence backslash escapes. The \" sequences are YAML escapes inside
# double-quoted scalars and are intentionally kept.
# The nesting indentation below is required: rules must sit under their group,
# and labels/annotations under their alert, or Prometheus rejects the file.
echo "Creating default alert rules in $RULES_DIR/node_alerts.yml..."
sudo tee "$RULES_DIR/node_alerts.yml" > /dev/null <<'EOF'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
EOF
# 2. Create the Advanced Main Configuration File
#
# Writes $MAIN_CONFIG (as root, via tee). The content is static, so the
# heredoc delimiter is quoted to rule out accidental shell expansion.
echo "Creating advanced prometheus.yml..."

# Overwriting is destructive: keep a timestamped copy of any existing
# configuration so a hand-tuned setup can be restored.
if sudo test -f "$MAIN_CONFIG"; then
  backup_file="${MAIN_CONFIG}.bak.$(date +%Y%m%d%H%M%S)"
  sudo cp -p "$MAIN_CONFIG" "$backup_file"
  echo "Existing configuration backed up to $backup_file"
fi

# The nesting indentation below is significant YAML structure; without it
# Prometheus cannot parse the file.
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

  # Example of a job with many targets (placeholder)
  # - job_name: 'external_nodes'
  #   static_configs:
  #     - targets:
  #         - '192.168.1.100:9100'
  #         - '192.168.1.101:9100'

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"
EOF
# 3. Apply changes by restarting Prometheus
#
# Validate before restarting: a restart with a broken configuration leaves
# Prometheus down. `promtool check config` also checks the rule files the
# configuration references. Under `set -e` a failed check aborts the script
# here, so the running instance is left untouched. When promtool is not
# installed the check is skipped with a warning rather than blocking.
if promtool_bin=$(command -v promtool); then
  echo "Validating $MAIN_CONFIG with promtool..."
  sudo "$promtool_bin" check config "$MAIN_CONFIG"
else
  echo "Warning: promtool not found; skipping configuration validation." >&2
fi

# Prefer the site-specific restart helper when one is installed; otherwise
# fall back to systemd.
if [ -x "/usr/bin/restart_prometheus" ]; then
  sudo /usr/bin/restart_prometheus
else
  sudo systemctl restart prometheus
fi

echo "-------------------------------------------------------"
echo "Advanced Prometheus configuration applied successfully!"
echo "Rules directory: $RULES_DIR"
echo "Alerting rules loaded from: node_alerts.yml"
echo "Alertmanager target set to: localhost:9093"
echo "-------------------------------------------------------"
echo "Note: If you haven't installed Alertmanager yet, you will see"
echo "errors in the Prometheus logs about connecting to 9093."

View File

@@ -0,0 +1,121 @@
#!/bin/bash
# Alertmanager Installation and Configuration Script
# This script installs Alertmanager and configures email notifications.
set -e

# Install the tools needed for the download/extract steps using whichever
# supported package manager is present (apt is probed first, then dnf).
# Any other system is rejected with exit status 1.
PREREQ_PACKAGES=(wget curl tar)
if command -v apt &>/dev/null; then
  echo "Detected apt-based system"
  sudo apt update
  sudo apt install -y "${PREREQ_PACKAGES[@]}"
elif command -v dnf &>/dev/null; then
  echo "Detected dnf-based system"
  sudo dnf install -y "${PREREQ_PACKAGES[@]}"
else
  echo "Unsupported package manager"
  exit 1
fi
# Download Alertmanager
VERSION="0.27.0"
CN_URL="https://s3.cloudyun.top/downloads/alertmanager-${VERSION}.linux-amd64.tar.gz"
GLOBAL_URL="https://github.com/prometheus/alertmanager/releases/download/v${VERSION}/alertmanager-${VERSION}.linux-amd64.tar.gz"

# Work inside a private, unpredictably named directory instead of fixed paths
# in /tmp: fixed names can be pre-created or symlinked by another local user,
# and were never cleaned up. The EXIT trap removes the directory on every
# exit path, including failures under `set -e`.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "${WORK_DIR:?}"' EXIT
TARGET="$WORK_DIR/alertmanager.tar.gz"

# Pick the mirror from the caller's country code. The lookup is best-effort
# (3 s timeout, errors ignored); anything other than "CN" selects GitHub.
# is_cn is not read in the visible script; kept in case a later step uses it.
is_cn=false
echo "Detecting geographic location..."
COUNTRY=$(curl -s --max-time 3 https://ipinfo.littlediary.cn/country || true)
if [ "$COUNTRY" = "CN" ]; then
  is_cn=true
  DOWNLOAD_URL="$CN_URL"
else
  DOWNLOAD_URL="$GLOBAL_URL"
fi
echo "Downloading from: $DOWNLOAD_URL"
# -f makes HTTP errors fatal so an error page is never saved as the tarball.
curl -fL -o "$TARGET" "$DOWNLOAD_URL"

# Extract and Install
echo "Extracting Alertmanager..."
tar -zxvf "$TARGET" -C "$WORK_DIR"
RELEASE_DIR="$WORK_DIR/alertmanager-${VERSION}.linux-amd64"
sudo mkdir -p /etc/alertmanager
# install(1) sets an explicit mode and replaces the destination via unlink,
# so re-running while Alertmanager is active does not fail with
# "Text file busy" the way an in-place cp does.
sudo install -m 0755 "$RELEASE_DIR/alertmanager" "$RELEASE_DIR/amtool" /usr/bin/
# Arguments for SMTP
# Placeholder defaults; they stay in effect when the user declines the
# interactive setup below.
SMTP_HOST="smtp.example.com:587"
SMTP_USER="user@example.com"
SMTP_PASS="password"
SMTP_FROM="alertmanager@example.com"
EMAIL_TO="recipient@example.com"

# Interactive SMTP Configuration
# All reads use -r so backslashes in the input are taken literally instead of
# being consumed as escape characters.
echo "--- Alertmanager Email Setup ---"
read -r -p "Do you want to enable Email Notifications? [y/N]: " ENABLE_EMAIL
if [[ "$ENABLE_EMAIL" =~ ^[Yy]$ ]]; then
  read -r -p "Enter SMTP Host (e.g. smtp.qq.com:465): " SMTP_HOST
  read -r -p "Enter SMTP Authentication Username: " SMTP_USER
  # IFS= additionally preserves leading/trailing whitespace, which can be a
  # legitimate part of a password; -s suppresses terminal echo.
  IFS= read -r -s -p "Enter SMTP Authentication Password: " SMTP_PASS
  echo "" # New line after hidden password
  read -r -p "Enter Sender Email (e.g. noreply@domain.com): " SMTP_FROM
  read -r -p "Enter Recipient Email: " EMAIL_TO
fi
# Create Configuration with Email Support
echo "Creating alertmanager.yml..."

#######################################
# Escape a value for use inside a YAML single-quoted scalar. The only
# character that needs escaping there is the single quote, written as ''.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
yaml_sq_escape() {
  local sq="'"
  printf '%s' "${1//$sq/$sq$sq}"
}

AM_CONFIG="/etc/alertmanager/alertmanager.yml"

# The file stores the SMTP password, so create it with mode 0600 *before*
# writing any content; tee then fills the existing file and keeps that mode.
sudo install -m 0600 /dev/null "$AM_CONFIG"

# Unquoted heredoc on purpose: the SMTP values are substituted here. Each is
# escaped so a quote character in user input (e.g. in the password) cannot
# terminate the scalar and corrupt the file. The nesting indentation is
# required YAML structure.
sudo tee "$AM_CONFIG" > /dev/null <<EOF
global:
  resolve_timeout: 5m
  smtp_smarthost: '$(yaml_sq_escape "$SMTP_HOST")'
  smtp_from: '$(yaml_sq_escape "$SMTP_FROM")'
  smtp_auth_username: '$(yaml_sq_escape "$SMTP_USER")'
  smtp_auth_password: '$(yaml_sq_escape "$SMTP_PASS")'
  smtp_require_tls: false # Set to true for 587/TLS, false if using SSL/465

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email-notifications'

receivers:
  - name: 'email-notifications'
    email_configs:
      - to: '$(yaml_sq_escape "$EMAIL_TO")'
        send_resolved: true
EOF
# Create systemd service
# The unit text is static, so the heredoc delimiter is quoted and the
# line-continuation backslashes are written literally (one per line).
echo "Creating systemd service for Alertmanager..."
UNIT_FILE="/etc/systemd/system/alertmanager.service"
sudo tee "$UNIT_FILE" > /dev/null <<'UNIT'
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Type=simple
ExecStart=/usr/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/etc/alertmanager/data
Restart=always
[Install]
WantedBy=multi-user.target
UNIT
# Reload and Start
# Make systemd re-read its unit files, then enable the service at boot and
# start it immediately.
sudo systemctl daemon-reload
sudo systemctl enable --now alertmanager.service

# Closing summary for the operator, one argument per output line.
printf '%s\n' \
  "--------------------------------------------------------" \
  "Alertmanager installed and configured with EMAIL support!" \
  "Configuration File: /etc/alertmanager/alertmanager.yml" \
  "Please edit the configuration to set your SMTP details." \
  "--------------------------------------------------------"