2
This commit is contained in:
@@ -482,6 +482,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
// Aggregate across all sources
|
||||
let totalServers = 0;
|
||||
let activeServers = 0;
|
||||
let cpuUsed = 0, cpuTotal = 0;
|
||||
let memUsed = 0, memTotal = 0;
|
||||
let diskUsed = 0, diskTotal = 0;
|
||||
@@ -491,6 +492,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
for (const m of validMetrics) {
|
||||
totalServers += m.totalServers;
|
||||
activeServers += m.activeServers || m.totalServers; // Default if missing
|
||||
cpuUsed += m.cpu.used;
|
||||
cpuTotal += m.cpu.total;
|
||||
memUsed += m.memory.used;
|
||||
@@ -520,6 +522,7 @@ app.get('/api/metrics/overview', async (req, res) => {
|
||||
|
||||
res.json({
|
||||
totalServers,
|
||||
activeServers,
|
||||
cpu: {
|
||||
used: cpuUsed,
|
||||
total: cpuTotal,
|
||||
|
||||
@@ -110,6 +110,38 @@ async function query(baseUrl, expr) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetch the list of active scrape targets from a Prometheus server.
 *
 * @param {string} baseUrl - Base URL of the Prometheus server; run through
 *   normalizeUrl() before use, consistent with the other query helpers here.
 * @returns {Promise<Array<object>>} The `activeTargets` array from
 *   `/api/v1/targets` (empty array if the field is absent).
 * @throws {Error} If the request times out, Prometheus returns a non-2xx
 *   status, or the API response's `status` field is not `'success'`.
 */
async function getTargets(baseUrl) {
  const url = normalizeUrl(baseUrl);
  const controller = new AbortController();
  // Abort the request if Prometheus does not answer within QUERY_TIMEOUT ms.
  const timer = setTimeout(() => controller.abort(), QUERY_TIMEOUT);

  try {
    const res = await fetch(`${url}/api/v1/targets`, {
      signal: controller.signal
    });

    if (!res.ok) {
      throw new Error(`Prometheus returned HTTP ${res.status}`);
    }

    const data = await res.json();
    if (data.status !== 'success') {
      throw new Error(`Prometheus targets fetch failed: ${data.error || 'unknown error'}`);
    }
    return data.data.activeTargets || [];
  } catch (err) {
    // Surface timeouts with a clearer message; rethrow everything else as-is.
    if (err.name === 'AbortError') {
      throw new Error('Prometheus targets fetch timed out');
    }
    throw err;
  } finally {
    // FIX: the original only cleared the timer on the success path, so a
    // rejected fetch leaked a live timer (keeping the event loop alive for
    // up to QUERY_TIMEOUT ms). `finally` guarantees cleanup on every path.
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
|
||||
* Execute a Prometheus range query
|
||||
*/
|
||||
@@ -145,9 +177,6 @@ async function queryRange(baseUrl, expr, start, end, step) {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get overview metrics from a single Prometheus source
|
||||
*/
|
||||
async function getOverviewMetrics(url, sourceName) {
|
||||
// Run all queries in parallel
|
||||
const [
|
||||
@@ -161,7 +190,7 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
netTxResult,
|
||||
traffic24hRxResult,
|
||||
traffic24hTxResult,
|
||||
upResult
|
||||
targetsResult
|
||||
] = await Promise.all([
|
||||
// CPU usage per instance: 1 - avg idle
|
||||
query(url, '100 - (avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)').catch(() => []),
|
||||
@@ -183,9 +212,8 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
query(url, 'sum by (instance, job) (increase(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|br-.*"}[24h]))').catch(() => []),
|
||||
// Total traffic transmitted in last 24h
|
||||
query(url, 'sum by (instance, job) (increase(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|br-.*"}[24h]))').catch(() => []),
|
||||
// Up instances (at least one successful scrape in last 5m)
|
||||
// We broaden the job filter to catch more variations of node-exporter jobs
|
||||
query(url, 'max_over_time(up{job=~".*node.*|.*exporter.*|.*host.*"}[5m])').catch(() => [])
|
||||
// Targets status from /api/v1/targets
|
||||
getTargets(url).catch(() => [])
|
||||
]);
|
||||
|
||||
// Build per-instance data map
|
||||
@@ -223,10 +251,18 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
return inst;
|
||||
};
|
||||
|
||||
// Parse UP status
|
||||
for (const r of upResult) {
|
||||
const inst = getOrCreate(r.metric);
|
||||
inst.up = parseFloat(r.value[1]) === 1;
|
||||
// Initialize instances from targets first (to ensure we have all servers even if they have no metrics)
|
||||
const nodeJobRegex = /node|exporter|host/i;
|
||||
for (const target of targetsResult) {
|
||||
const labels = target.labels || {};
|
||||
const instance = labels.instance;
|
||||
const job = labels.job;
|
||||
|
||||
// Only include targets that look like node-exporters
|
||||
if (instance && (nodeJobRegex.test(job) || nodeJobRegex.test(target.scrapePool))) {
|
||||
const inst = getOrCreate(labels);
|
||||
inst.up = target.health === 'up';
|
||||
}
|
||||
}
|
||||
|
||||
// Parse CPU usage
|
||||
@@ -271,15 +307,14 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
inst.netTx = parseFloat(r.value[1]) || 0;
|
||||
}
|
||||
|
||||
// Final check: If an instance has non-zero CPU or Memory total data but is marked offline,
|
||||
// it means we missed its 'up' metric due to job labels, but it's clearly sending data.
|
||||
for (const inst of instances.values()) {
|
||||
if (!inst.up && (inst.cpuPercent > 0 || inst.memTotal > 0)) {
|
||||
inst.up = true;
|
||||
}
|
||||
}
|
||||
|
||||
const activeInstances = Array.from(instances.values()).filter(inst => inst.up);
|
||||
const allInstancesList = Array.from(instances.values());
|
||||
const activeInstances = allInstancesList.filter(inst => inst.up);
|
||||
|
||||
// Aggregate
|
||||
let totalCpuUsed = 0, totalCpuCores = 0;
|
||||
@@ -308,7 +343,8 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
}
|
||||
|
||||
return {
|
||||
totalServers: activeInstances.length,
|
||||
totalServers: allInstancesList.length,
|
||||
activeServers: activeInstances.length,
|
||||
cpu: {
|
||||
used: totalCpuUsed,
|
||||
total: totalCpuCores,
|
||||
@@ -334,7 +370,7 @@ async function getOverviewMetrics(url, sourceName) {
|
||||
tx: totalTraffic24hTx,
|
||||
total: totalTraffic24hRx + totalTraffic24hTx
|
||||
},
|
||||
servers: activeInstances.map(s => {
|
||||
servers: allInstancesList.map(s => {
|
||||
const { originalInstance, ...rest } = s;
|
||||
return rest;
|
||||
})
|
||||
@@ -607,6 +643,7 @@ module.exports = {
|
||||
testConnection,
|
||||
query,
|
||||
queryRange,
|
||||
getTargets,
|
||||
getOverviewMetrics,
|
||||
getNetworkHistory,
|
||||
mergeNetworkHistories,
|
||||
|
||||
Reference in New Issue
Block a user