Linux-Shell/Prometheus/advanced_prometheus_config.sh

#!/bin/bash

# Advanced Prometheus Configuration Script
# This script sets up a production-ready Prometheus configuration
# including alerting rules, rule files, and Alertmanager integration.

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail). The -u
# flag matters here because every path below is built from variables and is
# written to with root privileges.
set -euo pipefail

# Filesystem layout used by the rest of this script.
readonly CONFIG_DIR="/etc/prometheus"
# Directory holding the alerting/recording rule files.
readonly RULES_DIR="$CONFIG_DIR/rules"
# Created below; not referenced again in the visible part of this script.
readonly DATA_DIR="$CONFIG_DIR/prometheus_data"
# Main Prometheus configuration file.
readonly MAIN_CONFIG="$CONFIG_DIR/prometheus.yml"

# Ensure directories exist (created via sudo, so they are owned by root).
sudo mkdir -p -- "$RULES_DIR" "$DATA_DIR"

echo "Configuring Advanced Prometheus Features..."

# 1. Write the bundled node_exporter alerting rules
#
# The file contains one rule group (node_exporter_alerts) with tiered
# instance-down alerts, a missing-job alert, and CPU / memory / disk usage
# alerts. The heredoc delimiter is quoted, so the shell performs no expansion:
# the Prometheus template expressions ({{ $labels... }}, {{ $value ... }}) and
# the backslash-escaped quotes reach the file exactly as written below.
node_rules_file="$RULES_DIR/node_alerts.yml"
printf '%s\n' "Creating default alert rules in ${node_rules_file}..."
sudo tee "$node_rules_file" >/dev/null <<'NODE_RULES'
groups:
  - name: node_exporter_alerts
    rules:
      # Tiered Alerting for Instance Downtime
      # Level 1: Warning if down for 2 minutes
      - alert: InstanceDownWarning
        expr: up == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} offline (Warning)"
          description: "Target {{ $labels.instance }} has been unreachable for over 2 minutes."

      # Level 2: Critical if down for 5 minutes
      - alert: InstanceDownCritical
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} offline (CRITICAL)"
          description: "Crucial service node {{ $labels.instance }} is DOWN for more than 5 minutes! Immediate action required."

      # Alert for missing job entirely (e.g. no targets configured)
      - alert: JobMissing
        expr: absent(up{job="nodes"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job {{ $labels.job }} is missing"
          description: "Prometheus is not receiving any data from the 'nodes' job. This usually means all targets are down or the configuration is broken."

      # Alert for high CPU usage (>80%)
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Memory usage (>85%)
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage on {{ $labels.instance }}"
          description: "Memory usage is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."

      # Alert for high Disk usage (>90%)
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "Disk usage on / is at {{ $value | printf \"%.2f\" }}% on {{ $labels.instance }}."
NODE_RULES

# 2. Create the Advanced Main Configuration File
#
# Any existing prometheus.yml is copied to a timestamped backup first, so a
# hand-tuned configuration is not lost when this script overwrites it.
# `sudo test` is used because the config directory may not be readable by the
# invoking user.
if sudo test -f "$MAIN_CONFIG"; then
    config_backup="${MAIN_CONFIG}.bak.$(date +%Y%m%d%H%M%S)"
    sudo cp -p -- "$MAIN_CONFIG" "$config_backup"
    echo "Existing configuration backed up to $config_backup"
fi

echo "Creating advanced prometheus.yml..."
# Quoted delimiter: the YAML below is written verbatim, with no shell expansion.
# The commented-out 'external_nodes' example is kept inside scrape_configs
# (before the top-level remote_write example) so that uncommenting either
# example in place yields valid YAML.
sudo tee "$MAIN_CONFIG" > /dev/null <<'EOF'
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

  # external_labels allows this Prometheus to be identified in a multi-Prometheus environment
  external_labels:
    monitor: 'master-monitor'

# Alerting specifies settings for Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Default Alertmanager port

# rule_files specifies where to load alerting and recording rules
rule_files:
  - "rules/*.yml"

# scrape_configs defines what targets Prometheus will scrape
scrape_configs:
  # The prometheus self-monitoring job
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Basic node_exporter job
  - job_name: 'nodes'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9100']

  # Example of a job with many targets (placeholder)
  # - job_name: 'external_nodes'
  #   static_configs:
  #     - targets:
  #       - '192.168.1.100:9100'
  #       - '192.168.1.101:9100'

# Remote Write (Example for external storage like Grafana Cloud, VictoriaMetrics, etc.)
# remote_write:
#   - url: "https://your-remote-write-endpoint/api/v1/write"
#     basic_auth:
#       username: "your_user"
#       password: "your_password"
EOF

# 3. Validate the generated configuration, then apply it by restarting Prometheus
#
# `promtool check config` parses the main configuration together with the rule
# files it references. Prometheus refuses to start on an invalid configuration,
# so the script aborts BEFORE the restart when validation fails, leaving the
# running instance untouched. When promtool is not installed the check is
# skipped with a warning and the restart proceeds.
promtool_bin="$(command -v promtool || true)"
if [[ -n "$promtool_bin" ]]; then
    echo "Validating $MAIN_CONFIG with promtool..."
    # Run via sudo with the resolved path so the files are readable regardless
    # of the permissions they were created with.
    if ! sudo "$promtool_bin" check config "$MAIN_CONFIG"; then
        echo "ERROR: configuration validation failed; Prometheus was NOT restarted." >&2
        exit 1
    fi
else
    echo "WARNING: promtool not found in PATH; skipping configuration validation." >&2
fi

# Use the restart helper when it is installed and executable; otherwise fall
# back to systemd.
if [[ -x "/usr/bin/restart_prometheus" ]]; then
    sudo /usr/bin/restart_prometheus
else
    sudo systemctl restart prometheus
fi

# Summary for the operator.
echo "-------------------------------------------------------"
echo "Advanced Prometheus configuration applied successfully!"
echo "Rules directory: $RULES_DIR"
echo "Alerting rules loaded from: node_alerts.yml"
echo "Alertmanager target set to: localhost:9093"
echo "-------------------------------------------------------"
echo "Note: If you haven't installed Alertmanager yet, you will see"
echo "errors in the Prometheus logs about connecting to 9093."