#!/bin/bash
#
# Advanced Prometheus Configuration Script
#
# This script sets up a production-ready Prometheus configuration
# including alerting rules, rule files, and Alertmanager integration.
#
# It writes under /etc/prometheus via sudo and then restarts Prometheus,
# so it must be run by a user with sudo rights.

# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Filesystem layout. RULES_DIR must stay a "rules" directory next to
# MAIN_CONFIG because the generated prometheus.yml loads "rules/*.yml"
# relative to its own location.
readonly CONFIG_DIR="/etc/prometheus"
readonly RULES_DIR="$CONFIG_DIR/rules"
# DATA_DIR is only created here; nothing else in this script refers to it.
readonly DATA_DIR="$CONFIG_DIR/prometheus_data"
readonly MAIN_CONFIG="$CONFIG_DIR/prometheus.yml"

# Ensure directories exist (mkdir -p is a no-op when they already do).
sudo mkdir -p "$RULES_DIR"
sudo mkdir -p "$DATA_DIR"

echo "Configuring Advanced Prometheus Features..."

# 1. Create a Sample Alert Rule File
#
# Writes $RULES_DIR/node_alerts.yml as root (tee under sudo; stdout discarded).
# The heredoc delimiter is quoted so the shell expands nothing inside it:
# the Go-template placeholders ({{ $labels.* }}, {{ $value }}) reach the file
# verbatim. The \" sequences are YAML escapes inside double-quoted strings.
echo "Creating default alert rules in $RULES_DIR/node_alerts.yml..."
sudo tee "$RULES_DIR/node_alerts.yml" > /dev/null <<'EOF'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
EOF

# 2. Create the Advanced Main Configuration File
#
# Writes $MAIN_CONFIG as root (tee under sudo; stdout discarded). The heredoc
# delimiter is quoted: the YAML below contains no shell variables and must be
# written to disk exactly as it appears here.
echo "Creating advanced prometheus.yml..."
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"

# Example of a job with many targets (placeholder)
# - job_name: 'external_nodes'
#   static_configs:
#     - targets:
#       - '192.168.1.100:9100'
#       - '192.168.1.101:9100'

EOF

# 3. Apply changes by restarting Prometheus
#
# Restarting with an invalid configuration leaves Prometheus down, so when
# promtool is installed the new config (and the rule files it references) is
# validated first. promtool is resolved to an absolute path before the sudo
# call because sudo may run with a restricted PATH.
if promtool_bin=$(command -v promtool); then
  echo "Validating $MAIN_CONFIG with promtool..."
  if ! sudo "$promtool_bin" check config "$MAIN_CONFIG"; then
    echo "Error: $MAIN_CONFIG failed validation; Prometheus was not restarted." >&2
    exit 1
  fi
fi

# Use the dedicated restart helper when it exists and is executable;
# otherwise fall back to restarting the systemd unit.
if [[ -x "/usr/bin/restart_prometheus" ]]; then
  sudo /usr/bin/restart_prometheus
else
  sudo systemctl restart prometheus
fi

# Final summary for the operator. Reached only when every previous step
# succeeded, because the script aborts on the first failing command.
echo "-------------------------------------------------------"
echo "Advanced Prometheus configuration applied successfully!"
echo "Rules directory: $RULES_DIR"
echo "Alerting rules loaded from: node_alerts.yml"
echo "Alertmanager target set to: localhost:9093"
echo "-------------------------------------------------------"
echo "Note: If you haven't installed Alertmanager yet, you will see"
echo "errors in the Prometheus logs about connecting to 9093."