2
This commit is contained in:
@@ -482,6 +482,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
// Aggregate across all sources
|
||||
let totalServers = 0;
|
||||
let activeServers = 0;
|
||||
let cpuUsed = 0, cpuTotal = 0;
|
||||
let memUsed = 0, memTotal = 0;
|
||||
let diskUsed = 0, diskTotal = 0;
|
||||
@@ -491,6 +492,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
for (const m of validMetrics) {
|
||||
totalServers += m.totalServers;
|
||||
activeServers += m.activeServers || m.totalServers; // Default if missing
|
||||
cpuUsed += m.cpu.used;
|
||||
cpuTotal += m.cpu.total;
|
||||
memUsed += m.memory.used;
|
||||
@@ -520,6 +522,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
res.json({
|
||||
totalServers,
|
||||
activeServers,
|
||||
cpu: {
|
||||
used: cpuUsed,
|
||||
total: cpuTotal,
|
||||
|
||||
@@ -110,6 +110,38 @@ async function query(baseUrl, expr) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetch the list of active scrape targets from a Prometheus server.
 *
 * @param {string} baseUrl - Base URL of the Prometheus server; run through
 *   normalizeUrl() before use, consistent with the other query helpers here.
 * @returns {Promise<Array<object>>} The `activeTargets` array from
 *   `/api/v1/targets` (empty array if the field is absent).
 * @throws {Error} If the request times out, Prometheus returns a non-2xx
 *   status, or the API response's `status` field is not `'success'`.
 */
async function getTargets(baseUrl) {
  const url = normalizeUrl(baseUrl);
  const controller = new AbortController();
  // Abort the request if Prometheus does not answer within QUERY_TIMEOUT ms.
  const timer = setTimeout(() => controller.abort(), QUERY_TIMEOUT);

  try {
    const res = await fetch(`${url}/api/v1/targets`, {
      signal: controller.signal
    });

    if (!res.ok) {
      throw new Error(`Prometheus returned HTTP ${res.status}`);
    }

    const data = await res.json();
    if (data.status !== 'success') {
      throw new Error(`Prometheus targets fetch failed: ${data.error || 'unknown error'}`);
    }
    return data.data.activeTargets || [];
  } catch (err) {
    // Surface timeouts with a clearer message; rethrow everything else as-is.
    if (err.name === 'AbortError') {
      throw new Error('Prometheus targets fetch timed out');
    }
    throw err;
  } finally {
    // FIX: the original only cleared the timer on the success path, so a
    // rejected fetch leaked a live timer (keeping the event loop alive for
    // up to QUERY_TIMEOUT ms). `finally` guarantees cleanup on every path.
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
|
||||
* Execute a Prometheus range query
|
||||
*/
|
||||
@@ -145,9 +177,6 @@ async function queryRange(baseUrl, expr, start, end, step) {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get overview metrics from a single Prometheus source
|
||||
*/
|
||||
async function getOverviewMetrics(url, sourceName) {
|
||||
// Run all queries in parallel
|
||||
const [
|
||||
@@ -161,7 +190,7 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
netTxResult,
|
||||
traffic24hRxResult,
|
||||
traffic24hTxResult,
|
||||
upResult
|
||||
targetsResult
|
||||
] = await Promise.all([
|
||||
// CPU usage per instance: 1 - avg idle
|
||||
query(url, '100 - (avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)').catch(() => []),
|
||||
@@ -183,9 +212,8 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
query(url, 'sum by (instance, job) (increase(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|br-.*"}[24h]))').catch(() => []),
|
||||
// Total traffic transmitted in last 24h
|
||||
query(url, 'sum by (instance, job) (increase(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|br-.*"}[24h]))').catch(() => []),
|
||||
// Up instances (at least one successful scrape in last 5m)
|
||||
// We broaden the job filter to catch more variations of node-exporter jobs
|
||||
query(url, 'max_over_time(up{job=~".*node.*|.*exporter.*|.*host.*"}[5m])').catch(() => [])
|
||||
// Targets status from /api/v1/targets
|
||||
getTargets(url).catch(() => [])
|
||||
]);
|
||||
|
||||
// Build per-instance data map
|
||||
@@ -223,10 +251,18 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
return inst;
|
||||
};
|
||||
|
||||
// Parse UP status
|
||||
for (const r of upResult) {
|
||||
const inst = getOrCreate(r.metric);
|
||||
inst.up = parseFloat(r.value[1]) === 1;
|
||||
// Initialize instances from targets first (to ensure we have all servers even if they have no metrics)
|
||||
const nodeJobRegex = /node|exporter|host/i;
|
||||
for (const target of targetsResult) {
|
||||
const labels = target.labels || {};
|
||||
const instance = labels.instance;
|
||||
const job = labels.job;
|
||||
|
||||
// Only include targets that look like node-exporters
|
||||
if (instance && (nodeJobRegex.test(job) || nodeJobRegex.test(target.scrapePool))) {
|
||||
const inst = getOrCreate(labels);
|
||||
inst.up = target.health === 'up';
|
||||
}
|
||||
}
|
||||
|
||||
// Parse CPU usage
|
||||
@@ -271,15 +307,14 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
inst.netTx = parseFloat(r.value[1]) || 0;
|
||||
}
|
||||
|
||||
// Final check: If an instance has non-zero CPU or Memory total data but is marked offline,
|
||||
// it means we missed its 'up' metric due to job labels, but it's clearly sending data.
|
||||
for (const inst of instances.values()) {
|
||||
if (!inst.up && (inst.cpuPercent > 0 || inst.memTotal > 0)) {
|
||||
inst.up = true;
|
||||
}
|
||||
}
|
||||
|
||||
const activeInstances = Array.from(instances.values()).filter(inst => inst.up);
|
||||
const allInstancesList = Array.from(instances.values());
|
||||
const activeInstances = allInstancesList.filter(inst => inst.up);
|
||||
|
||||
// Aggregate
|
||||
let totalCpuUsed = 0, totalCpuCores = 0;
|
||||
@@ -308,7 +343,8 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
}
|
||||
|
||||
return {
|
||||
totalServers: activeInstances.length,
|
||||
totalServers: allInstancesList.length,
|
||||
activeServers: activeInstances.length,
|
||||
cpu: {
|
||||
used: totalCpuUsed,
|
||||
total: totalCpuCores,
|
||||
@@ -334,7 +370,7 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
tx: totalTraffic24hTx,
|
||||
total: totalTraffic24hRx + totalTraffic24hTx
|
||||
},
|
||||
servers: activeInstances.map(s => {
|
||||
servers: allInstancesList.map(s => {
|
||||
const { originalInstance, ...rest } = s;
|
||||
return rest;
|
||||
})
|
||||
@@ -607,6 +643,7 @@ module.exports = {
|
||||
testConnection,
|
||||
query,
|
||||
queryRange,
|
||||
getTargets,
|
||||
getOverviewMetrics,
|
||||
getNetworkHistory,
|
||||
mergeNetworkHistories,
|
||||
|
||||
Reference in New Issue
Block a user