添加Prometheus高级配置脚本
This commit is contained in:
154
Prometheus/advanced_prometheus_config.sh
Normal file
154
Prometheus/advanced_prometheus_config.sh
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/bin/bash
#
# Advanced Prometheus Configuration Script
#
# Writes an alert rule file and a main prometheus.yml (with an Alertmanager
# target) under /etc/prometheus, then restarts Prometheus.

# Stop at the first command that exits non-zero.
set -e

# Filesystem layout; every path hangs off the configuration root.
CONFIG_DIR="/etc/prometheus"
MAIN_CONFIG="$CONFIG_DIR/prometheus.yml"
RULES_DIR="$CONFIG_DIR/rules"
DATA_DIR="$CONFIG_DIR/prometheus_data"

# Ensure directories exist (mkdir -p is a no-op for ones already present).
for required_dir in "$RULES_DIR" "$DATA_DIR"; do
  sudo mkdir -p "$required_dir"
done

echo "Configuring Advanced Prometheus Features..."
|
||||
|
||||
# 1. Create a Sample Alert Rule File
#
# The heredoc delimiter is quoted ('EOF'), so the shell performs no expansion
# inside it: the Go-template placeholders ({{ $labels.instance }}, {{ $value }})
# reach the file verbatim without a backslash in front of every '$'. The \"
# sequences are kept on purpose; they are YAML escapes inside double-quoted
# scalars, not shell escapes.
echo "Creating default alert rules in $RULES_DIR/node_alerts.yml..."
sudo tee "$RULES_DIR/node_alerts.yml" > /dev/null <<'EOF'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
EOF
|
||||
|
||||
# 2. Create the Advanced Main Configuration File
#
# An existing prometheus.yml is copied to a timestamped backup first, because
# the tee below replaces the whole file and would otherwise discard any scrape
# jobs that were configured by hand.
if sudo test -f "$MAIN_CONFIG"; then
  config_backup="$MAIN_CONFIG.bak.$(date +%Y%m%d%H%M%S)"
  echo "Backing up existing configuration to $config_backup"
  sudo cp -p "$MAIN_CONFIG" "$config_backup"
fi

echo "Creating advanced prometheus.yml..."
# Quoted delimiter: the YAML below contains no shell variables, so it is
# written out exactly as typed.
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"

# Example of a job with many targets (placeholder)
# - job_name: 'external_nodes'
#   static_configs:
#     - targets:
#       - '192.168.1.100:9100'
#       - '192.168.1.101:9100'

EOF
|
||||
|
||||
# 3. Apply changes by restarting Prometheus
#
# Validate before restarting: a restart with a broken configuration leaves
# Prometheus down, whereas aborting here keeps the running instance untouched.
# promtool ships with Prometheus; when it is not on PATH the check is skipped
# with a warning so the script still works on minimal installs.
if promtool_bin=$(command -v promtool); then
  echo "Validating $MAIN_CONFIG with promtool..."
  if ! sudo "$promtool_bin" check config "$MAIN_CONFIG"; then
    echo "Error: configuration validation failed; Prometheus was NOT restarted." >&2
    exit 1
  fi
else
  echo "Warning: promtool not found; skipping configuration validation." >&2
fi

# Prefer the site-specific restart helper when one is installed; otherwise
# fall back to systemd.
if [ -x "/usr/bin/restart_prometheus" ]; then
  sudo /usr/bin/restart_prometheus
else
  sudo systemctl restart prometheus
fi

echo "-------------------------------------------------------"
echo "Advanced Prometheus configuration applied successfully!"
echo "Rules directory: $RULES_DIR"
echo "Alerting rules loaded from: node_alerts.yml"
echo "Alertmanager target set to: localhost:9093"
echo "-------------------------------------------------------"
echo "Note: If you haven't installed Alertmanager yet, you will see"
echo "errors in the Prometheus logs about connecting to 9093."
|
||||
121
Prometheus/install_Alertmanager.sh
Normal file
121
Prometheus/install_Alertmanager.sh
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/bin/bash
#
# Alertmanager Installation and Configuration Script
#
# Installs the Alertmanager release binaries, writes
# /etc/alertmanager/alertmanager.yml with an email receiver, and registers a
# systemd service for it.

# Stop at the first command that exits non-zero.
set -e

# Detect the package manager and install the download/extract prerequisites.
# apt-get is used rather than apt: apt's command-line interface is intended
# for interactive use and warns when driven from a script.
if command -v apt-get >/dev/null 2>&1; then
  echo "Detected apt-based system"
  sudo apt-get update
  sudo apt-get install -y wget curl tar
elif command -v dnf >/dev/null 2>&1; then
  echo "Detected dnf-based system"
  sudo dnf install -y wget curl tar
else
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "Unsupported package manager" >&2
  exit 1
fi
|
||||
|
||||
# Download Alertmanager
VERSION="0.27.0"
CN_URL="https://s3.cloudyun.top/downloads/alertmanager-${VERSION}.linux-amd64.tar.gz"
GLOBAL_URL="https://github.com/prometheus/alertmanager/releases/download/v${VERSION}/alertmanager-${VERSION}.linux-amd64.tar.gz"

# Work in a private scratch directory instead of fixed names in /tmp: a
# predictable path can be pre-created by another local user, and leftovers
# from an earlier run could be picked up by mistake. The directory is removed
# on every exit path.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT
TARGET="$WORK_DIR/alertmanager.tar.gz"

echo "Detecting geographic location..."
# Best effort by design: if the lookup fails or times out, COUNTRY is empty
# and the global (GitHub) URL is used.
COUNTRY=$(curl -s --max-time 3 https://ipinfo.littlediary.cn/country || true)
if [ "$COUNTRY" = "CN" ]; then
  DOWNLOAD_URL="$CN_URL"
else
  DOWNLOAD_URL="$GLOBAL_URL"
fi

echo "Downloading from: $DOWNLOAD_URL"
# -f makes curl fail on HTTP errors instead of saving an error page.
curl -fL -o "$TARGET" "$DOWNLOAD_URL"

# Extract and Install
echo "Extracting Alertmanager..."
tar -zxvf "$TARGET" -C "$WORK_DIR"
sudo mkdir -p /etc/alertmanager

# install sets the executable mode explicitly rather than inheriting whatever
# mode the archive carried.
RELEASE_DIR="$WORK_DIR/alertmanager-${VERSION}.linux-amd64"
sudo install -m 0755 "$RELEASE_DIR/alertmanager" "$RELEASE_DIR/amtool" /usr/bin/
|
||||
|
||||
# Arguments for SMTP
# Placeholder defaults; they are used unchanged when the user declines the
# interactive setup below.
SMTP_HOST="smtp.example.com:587"
SMTP_USER="user@example.com"
SMTP_PASS="password"
SMTP_FROM="alertmanager@example.com"
EMAIL_TO="recipient@example.com"

# Interactive SMTP Configuration
# Every read uses -r so backslashes typed by the user are kept literally;
# without it a password such as 'pa\ss' would be stored as 'pass'.
echo "--- Alertmanager Email Setup ---"
read -r -p "Do you want to enable Email Notifications? [y/N]: " ENABLE_EMAIL
if [[ "$ENABLE_EMAIL" =~ ^[Yy]$ ]]; then
  read -r -p "Enter SMTP Host (e.g. smtp.qq.com:465): " SMTP_HOST
  read -r -p "Enter SMTP Authentication Username: " SMTP_USER
  # IFS= additionally preserves leading/trailing whitespace in the password.
  IFS= read -r -s -p "Enter SMTP Authentication Password: " SMTP_PASS
  echo "" # New line after hidden password
  read -r -p "Enter Sender Email (e.g. noreply@domain.com): " SMTP_FROM
  read -r -p "Enter Recipient Email: " EMAIL_TO
fi
|
||||
|
||||
# Create Configuration with Email Support
echo "Creating alertmanager.yml..."

ALERTMANAGER_CONFIG="/etc/alertmanager/alertmanager.yml"

# Print $1 as a YAML single-quoted scalar. Inside such a scalar the only
# character needing escaping is the single quote itself, which is doubled;
# without this a quote in the password would break the file.
yaml_single_quoted() {
  local value=$1
  local quote="'"
  printf "'%s'" "${value//$quote/$quote$quote}"
}

# Port 587 is the STARTTLS submission port, so TLS is required there; any
# other port (including implicit-TLS 465) keeps the previous value of false.
# NOTE(review): confirm against your mail provider if you use another port.
case "$SMTP_HOST" in
  *:587) SMTP_REQUIRE_TLS=true ;;
  *)     SMTP_REQUIRE_TLS=false ;;
esac

# The file holds the SMTP password: create it empty with mode 0600 first so
# the secret is never world-readable, then fill it in (tee keeps the mode).
sudo install -m 0600 /dev/null "$ALERTMANAGER_CONFIG"
sudo tee "$ALERTMANAGER_CONFIG" > /dev/null <<EOF
global:
  resolve_timeout: 5m
  smtp_smarthost: $(yaml_single_quoted "$SMTP_HOST")
  smtp_from: $(yaml_single_quoted "$SMTP_FROM")
  smtp_auth_username: $(yaml_single_quoted "$SMTP_USER")
  smtp_auth_password: $(yaml_single_quoted "$SMTP_PASS")
  smtp_require_tls: $SMTP_REQUIRE_TLS # Set to true for 587/TLS, false if using SSL/465

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email-notifications'

receivers:
  - name: 'email-notifications'
    email_configs:
      - to: $(yaml_single_quoted "$EMAIL_TO")
        send_resolved: true
EOF
|
||||
|
||||
# Create systemd service
# The delimiter is quoted, so the unit text below is written exactly as it
# appears here; the trailing backslashes are systemd line continuations.
echo "Creating systemd service for Alertmanager..."
unit_file="/etc/systemd/system/alertmanager.service"
sudo tee "$unit_file" > /dev/null <<'EOF'
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
User=root
Group=root
Type=simple
ExecStart=/usr/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/etc/alertmanager/data
Restart=always

[Install]
WantedBy=multi-user.target
EOF
|
||||
|
||||
# Reload and Start
# enable + restart instead of 'enable --now': --now only starts a stopped
# unit, so re-running this script against an already-running Alertmanager
# would leave the old configuration in effect. restart also starts the unit
# when it is not running yet.
sudo systemctl daemon-reload
sudo systemctl enable alertmanager.service
sudo systemctl restart alertmanager.service

echo "--------------------------------------------------------"
echo "Alertmanager installed and configured with EMAIL support!"
echo "Configuration File: /etc/alertmanager/alertmanager.yml"
# Only ask the user to edit the file when placeholder SMTP values were
# written, i.e. when the interactive email setup was declined.
if [[ "${ENABLE_EMAIL:-}" =~ ^[Yy]$ ]]; then
  echo "SMTP details were taken from your answers; edit the file to change them."
else
  echo "Please edit the configuration to set your SMTP details."
fi
echo "--------------------------------------------------------"
|
||||
Reference in New Issue
Block a user