monitoring: add Grafana dashboards + kube-state-metrics & node-exporter

Dashboards (provisioned via ConfigMaps into Grafana pod, 'K3s Cluster' folder):
- Cluster Overview: per-namespace CPU/mem/net/fs, pod counts, pod health (KSM)
- Pods & Services: per-pod CPU/mem/net/fs, throttling, pod status, restarts, PVCs
- Nodes: per-node CPU%/mem%, load average, disk usage, network (node-exporter)
- Control Plane & API Server: request rate, latency p95, 5xx, kubelet/PLEG
- Prometheus Self-Monitoring: ingestion, series, scrape duration, memory

Exporters (auto-scraped via existing kubernetes-service-endpoints job):
- kube-state-metrics: pod/deployment/PVC/replica state (kube_pod_status_phase,
  kube_pod_container_status_restarts_total, kube_persistentvolumeclaim_*)
- node-exporter (DaemonSet, hostNetwork): node_cpu_seconds_total,
  node_memory_*, node_filesystem_*, node_load*, node_network_*
This commit is contained in:
Roger Oriol
2026-06-26 19:34:27 +02:00
parent 2eab82b430
commit bf1387dc3e
9 changed files with 1629 additions and 0 deletions

View File

@@ -0,0 +1,279 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-nodes
namespace: monitoring
labels:
app: grafana
grafana_dashboard: "1"
data:
nodes.json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "short"
},
"overrides": []
},
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "value_and_name"
},
"pluginVersion": "10.2.3",
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_pods", "refId": "A"}, {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_containers", "refId": "B"}],
"title": "Pods / Containers per Node",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 70}, {"color": "red", "value": 90}]},
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 0},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "value_and_name"
},
"pluginVersion": "10.2.3",
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "{{instance}}", "refId": "A"}],
"title": "Node CPU Usage %",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 6},
"id": 3,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "{{instance}}", "refId": "A"}],
"title": "Node CPU Usage % (over time)",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 6},
"id": 4,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "legendFormat": "{{instance}}", "refId": "A"}],
"title": "Node Memory Usage %",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 15},
"id": 5,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(container_memory_working_set_bytes{container!=\"\",container!=\"POD\"}) by (instance)", "legendFormat": "used {{instance}}", "refId": "A"}, {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_memory_MemTotal_bytes", "legendFormat": "total {{instance}}", "refId": "B"}],
"title": "Node Memory (used vs total)",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 15},
"id": 6,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|docker.*|br-.*|cni.*|flannel.*\"}[5m]))", "legendFormat": "RX {{instance}}", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum by (instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|docker.*|br-.*|cni.*|flannel.*\"}[5m]))", "legendFormat": "TX {{instance}}", "refId": "B"}
],
"title": "Node Network Traffic",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 24},
"id": 7,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"})) * 100", "legendFormat": "{{instance}} {{mountpoint}}", "refId": "A"}],
"title": "Node Disk Usage %",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "short"
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 24},
"id": 8,
"options": {
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load1", "legendFormat": "1m {{instance}}", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load5", "legendFormat": "5m {{instance}}", "refId": "B"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load15", "legendFormat": "15m {{instance}}", "refId": "C"}
],
"title": "Node Load Average",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "Prometheus"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "short"
},
"overrides": []
},
"gridPos": {"h": 9, "w": 24, "x": 0, "y": 33},
"id": 9,
"options": {
"showHeader": true,
"cellHeight": "sm",
"footer": {"show": false, "reducer": ["sum"], "countRows": false, "fields": ""}
},
"pluginVersion": "10.2.3",
"targets": [
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_pods", "format": "table", "instant": true, "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_containers", "format": "table", "instant": true, "refId": "B"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "format": "table", "instant": true, "refId": "C"},
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "format": "table", "instant": true, "refId": "D"}
],
"title": "Node Summary (live)",
"type": "table",
"transformations": [
{"id": "merge", "options": {}},
{"id": "groupBy", "options": {"fields": {"Value": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #B": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #C": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #D": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "instance": {"aggregations": [], "operation": "groupby"}}}},
{"id": "organize", "options": {"excludeByName": {"Time": true}, "renameByName": {"Value": "Pods", "Value #B": "Containers", "Value #C": "CPU %", "Value #D": "Memory %"}}}
]
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["k3s", "nodes"],
"templating": {"list": []},
"time": {"from": "now-6h", "to": "now"},
"timepicker": {},
"timezone": "",
"title": "Nodes",
"uid": "k3s-nodes",
"version": 2,
"weekStart": ""
}