添加Prometheus高级配置脚本
This commit is contained in:
154
Prometheus/advanced_prometheus_config.sh
Normal file
154
Prometheus/advanced_prometheus_config.sh
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
#!/bin/bash
#
# Advanced Prometheus Configuration Script
#
# Sets up a production-ready Prometheus configuration, including alerting
# rules, rule files, and Alertmanager integration.

# Stop at the first command that exits non-zero.
set -e

# Filesystem layout; the three paths below are all derived from CONFIG_DIR.
CONFIG_DIR="/etc/prometheus"
RULES_DIR="${CONFIG_DIR}/rules"
DATA_DIR="${CONFIG_DIR}/prometheus_data"
MAIN_CONFIG="${CONFIG_DIR}/prometheus.yml"

# Ensure directories exist (mkdir -p also creates missing parents and is a
# no-op when the directory is already there).
for required_dir in "$RULES_DIR" "$DATA_DIR"; do
  sudo mkdir -p "$required_dir"
done

echo "Configuring Advanced Prometheus Features..."

# 1. Create a Sample Alert Rule File
#
# Writes $RULES_DIR/node_alerts.yml as root. The heredoc delimiter is quoted
# ('EOF'), so the shell performs no expansion inside it: the Go-template
# references such as {{ $labels.instance }} reach the file exactly as typed,
# and the \" sequences stay as YAML escapes inside the double-quoted strings.
echo "Creating default alert rules in $RULES_DIR/node_alerts.yml..."
sudo tee "$RULES_DIR/node_alerts.yml" > /dev/null <<'EOF'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
EOF

# 2. Create the Advanced Main Configuration File
#
# Steps:
#   a) keep a timestamped copy of any existing $MAIN_CONFIG, because the
#      write below replaces the file wholesale;
#   b) write the new configuration as root (quoted heredoc delimiter, so the
#      text is emitted literally with no shell expansion);
#   c) if promtool is installed, validate the result and abort before the
#      restart step when the check fails.

# a) Back up the current configuration, if there is one.
if sudo test -f "$MAIN_CONFIG"; then
  backup_path="${MAIN_CONFIG}.bak.$(date +%Y%m%d%H%M%S)"
  echo "Backing up existing configuration to $backup_path"
  sudo cp -p "$MAIN_CONFIG" "$backup_path"
fi

# b) Write the new configuration.
echo "Creating advanced prometheus.yml..."
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"

# Example of a job with many targets (placeholder)
#   - job_name: 'external_nodes'
#     static_configs:
#       - targets:
#         - '192.168.1.100:9100'
#         - '192.168.1.101:9100'

EOF

# c) Validate before anything restarts Prometheus. The absolute path is
# resolved first because sudo may run with a different PATH than this shell.
promtool_bin=$(command -v promtool || true)
if [[ -n "$promtool_bin" ]]; then
  echo "Validating $MAIN_CONFIG with promtool..."
  if ! sudo "$promtool_bin" check config "$MAIN_CONFIG"; then
    echo "Configuration check failed; Prometheus was not restarted." >&2
    exit 1
  fi
fi

# 3. Apply changes by restarting Prometheus
#
# Use /usr/bin/restart_prometheus when it exists and is executable;
# otherwise restart the prometheus unit through systemctl.
restart_cmd=(systemctl restart prometheus)
if [[ -x "/usr/bin/restart_prometheus" ]]; then
  restart_cmd=(/usr/bin/restart_prometheus)
fi
sudo "${restart_cmd[@]}"

# Closing summary, one line per argument.
printf '%s\n' \
  "-------------------------------------------------------" \
  "Advanced Prometheus configuration applied successfully!" \
  "Rules directory: $RULES_DIR" \
  "Alerting rules loaded from: node_alerts.yml" \
  "Alertmanager target set to: localhost:9093" \
  "-------------------------------------------------------" \
  "Note: If you haven't installed Alertmanager yet, you will see" \
  "errors in the Prometheus logs about connecting to 9093."
121
Prometheus/install_Alertmanager.sh
Normal file
121
Prometheus/install_Alertmanager.sh
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/bin/bash
#
# Alertmanager Installation and Configuration Script
#
# Installs Alertmanager and configures email notifications.

# Stop at the first command that exits non-zero.
set -e

# Detect the operating system's package manager and install the tools this
# script relies on. apt-get is used rather than apt: apt is intended for
# interactive use and warns that its command-line interface is not stable
# for scripts.
if command -v apt-get >/dev/null 2>&1; then
  echo "Detected apt-based system"
  sudo apt-get update
  sudo apt-get install -y wget curl tar
elif command -v dnf >/dev/null 2>&1; then
  echo "Detected dnf-based system"
  sudo dnf install -y wget curl tar
else
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "Unsupported package manager" >&2
  exit 1
fi

# Download Alertmanager
#
# Picks a download URL based on a best-effort country lookup, fetches the
# release tarball into a private scratch directory, and installs the
# alertmanager and amtool binaries into /usr/bin.
VERSION="0.27.0"
ARCHIVE_NAME="alertmanager-${VERSION}.linux-amd64"
CN_URL="https://s3.cloudyun.top/downloads/${ARCHIVE_NAME}.tar.gz"
GLOBAL_URL="https://github.com/prometheus/alertmanager/releases/download/v${VERSION}/${ARCHIVE_NAME}.tar.gz"

# Work in a freshly created directory instead of fixed names under /tmp, and
# remove it on every exit path.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT
TARGET="$WORK_DIR/alertmanager.tar.gz"

echo "Detecting geographic location..."
# The lookup is allowed to fail (|| true): an empty or unexpected answer
# simply selects the global URL.
COUNTRY=$(curl -s --max-time 3 https://ipinfo.littlediary.cn/country || true)
if [[ "$COUNTRY" == "CN" ]]; then
  DOWNLOAD_URL="$CN_URL"
else
  DOWNLOAD_URL="$GLOBAL_URL"
fi

echo "Downloading from: $DOWNLOAD_URL"
if ! curl -fL -o "$TARGET" "$DOWNLOAD_URL"; then
  # Nothing left to try when the global URL itself failed.
  if [[ "$DOWNLOAD_URL" == "$GLOBAL_URL" ]]; then
    echo "Download failed: $DOWNLOAD_URL" >&2
    exit 1
  fi
  # The mirror failed; fall back to the upstream release URL once.
  echo "Mirror download failed; retrying from: $GLOBAL_URL" >&2
  curl -fL -o "$TARGET" "$GLOBAL_URL"
fi

# Extract and Install
echo "Extracting Alertmanager..."
tar -xzvf "$TARGET" -C "$WORK_DIR"
sudo mkdir -p /etc/alertmanager
# install sets the mode explicitly instead of inheriting it from the archive.
sudo install -m 0755 "$WORK_DIR/$ARCHIVE_NAME/alertmanager" /usr/bin/alertmanager
sudo install -m 0755 "$WORK_DIR/$ARCHIVE_NAME/amtool" /usr/bin/amtool

# Arguments for SMTP
#
# Placeholder values; they are written to the configuration unchanged unless
# the operator opts in to the prompts below.
SMTP_HOST="smtp.example.com:587"
SMTP_USER="user@example.com"
SMTP_PASS="password"
SMTP_FROM="alertmanager@example.com"
EMAIL_TO="recipient@example.com"

# Interactive SMTP Configuration
echo "--- Alertmanager Email Setup ---"
# When stdin is exhausted (e.g. a non-interactive run) read returns non-zero;
# treat that as "no" so the install continues with the placeholders instead
# of aborting under set -e.
read -r -p "Do you want to enable Email Notifications? [y/N]: " ENABLE_EMAIL || ENABLE_EMAIL=""
if [[ "$ENABLE_EMAIL" =~ ^[Yy]$ ]]; then
  # -r keeps backslashes in the typed values literal.
  read -r -p "Enter SMTP Host (e.g. smtp.qq.com:465): " SMTP_HOST
  read -r -p "Enter SMTP Authentication Username: " SMTP_USER
  # -s hides the typed password; IFS= preserves leading/trailing whitespace.
  IFS= read -r -s -p "Enter SMTP Authentication Password: " SMTP_PASS
  echo "" # New line after hidden password
  read -r -p "Enter Sender Email (e.g. noreply@domain.com): " SMTP_FROM
  read -r -p "Enter Recipient Email: " EMAIL_TO
fi

# Create Configuration with Email Support
#
# Writes /etc/alertmanager/alertmanager.yml from the SMTP_* / EMAIL_TO
# variables. Values are emitted as single-quoted YAML scalars with embedded
# quotes escaped, and the file is created with mode 0600 because it contains
# the SMTP password.

#######################################
# Print a string as a single-quoted YAML scalar.
# Arguments: $1 - raw value
# Outputs:   the value wrapped in '...', with each ' doubled ('' is the
#            escape for a quote inside a single-quoted YAML scalar)
#######################################
yaml_squote() {
  local raw=$1
  local q="'"
  printf "'%s'" "${raw//$q/$q$q}"
}

#######################################
# Choose the smtp_require_tls value for a host:port smarthost.
# Follows the script's original note: false for SSL on port 465, true
# otherwise (e.g. 587).
# Arguments: $1 - smarthost in host:port form
# Outputs:   "true" or "false"
#######################################
smtp_require_tls_for() {
  local port=${1##*:}
  if [[ "$port" == "465" ]]; then
    printf 'false'
  else
    printf 'true'
  fi
}

alertmanager_cfg="/etc/alertmanager/alertmanager.yml"

# Quote every user-supplied value up front so the heredoc stays readable.
yaml_smtp_host=$(yaml_squote "$SMTP_HOST")
yaml_smtp_from=$(yaml_squote "$SMTP_FROM")
yaml_smtp_user=$(yaml_squote "$SMTP_USER")
yaml_smtp_pass=$(yaml_squote "$SMTP_PASS")
yaml_email_to=$(yaml_squote "$EMAIL_TO")

# SMTP_REQUIRE_TLS, when set in the environment, overrides the port-based
# choice (useful for relays that do not fit the 465/other split).
require_tls=${SMTP_REQUIRE_TLS:-$(smtp_require_tls_for "$SMTP_HOST")}

echo "Creating alertmanager.yml..."
# Create (or reset) the file with owner-only permissions before any secret is
# written; tee then fills the existing file and leaves its mode untouched.
sudo install -m 0600 /dev/null "$alertmanager_cfg"
sudo tee "$alertmanager_cfg" > /dev/null <<EOF
global:
  resolve_timeout: 5m
  smtp_smarthost: $yaml_smtp_host
  smtp_from: $yaml_smtp_from
  smtp_auth_username: $yaml_smtp_user
  smtp_auth_password: $yaml_smtp_pass
  smtp_require_tls: $require_tls # true for 587/TLS, false if using SSL/465

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email-notifications'

receivers:
  - name: 'email-notifications'
    email_configs:
      - to: $yaml_email_to
        send_resolved: true
EOF

# Create systemd service
#
# Installs the unit file for Alertmanager. The unit runs /usr/bin/alertmanager
# as root with the config file and storage path shown on the ExecStart line.
echo "Creating systemd service for Alertmanager..."
unit_file="/etc/systemd/system/alertmanager.service"
# Quoted delimiter: the unit text is written exactly as typed, so the line
# continuations are plain single backslashes.
sudo tee "$unit_file" > /dev/null <<'UNIT'
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
User=root
Group=root
Type=simple
ExecStart=/usr/bin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/etc/alertmanager/data
Restart=always

[Install]
WantedBy=multi-user.target
UNIT

# Reload and Start
#
# Have systemd re-read its unit files, then enable the Alertmanager unit and
# start it in the same step (--now).
sudo systemctl daemon-reload
sudo systemctl enable --now alertmanager.service

# Closing summary, one line per argument.
printf '%s\n' \
  "--------------------------------------------------------" \
  "Alertmanager installed and configured with EMAIL support!" \
  "Configuration File: /etc/alertmanager/alertmanager.yml" \
  "Please edit the configuration to set your SMTP details." \
  "--------------------------------------------------------"
Reference in New Issue
Block a user