monitoring: add Grafana dashboards + kube-state-metrics & node-exporter
Dashboards (provisioned via ConfigMaps into the Grafana pod, 'K3s Cluster' folder):
- Cluster Overview: per-namespace CPU/mem/net/fs, pod counts, pod health (KSM)
- Pods & Services: per-pod CPU/mem/net/fs, throttling, pod status, restarts, PVCs
- Nodes: per-node CPU%/mem%, load average, disk usage, network (node-exporter)
- Control Plane & API Server: request rate, latency p95, 5xx, kubelet/PLEG
- Prometheus Self-Monitoring: ingestion, series, scrape duration, memory

Exporters (auto-scraped via the existing kubernetes-service-endpoints job):
- kube-state-metrics: pod/deployment/PVC/replica state (kube_pod_status_phase, kube_pod_container_status_restarts_total, kube_persistentvolumeclaim_*)
- node-exporter (DaemonSet, hostNetwork): node_cpu_seconds_total, node_memory_*, node_filesystem_*, node_load*, node_network_*
This commit is contained in:
279
monitoring/grafana-dashboard-nodes.yaml
Normal file
279
monitoring/grafana-dashboard-nodes.yaml
Normal file
@@ -0,0 +1,279 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-nodes
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: grafana
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
nodes.json: |
|
||||
{
|
||||
"annotations": {"list": []},
|
||||
"editable": true,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "10.2.3",
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_pods", "refId": "A"}, {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_containers", "refId": "B"}],
|
||||
"title": "Pods / Containers per Node",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 70}, {"color": "red", "value": 90}]},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 0},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "10.2.3",
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "{{instance}}", "refId": "A"}],
|
||||
"title": "Node CPU Usage %",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 6},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "{{instance}}", "refId": "A"}],
|
||||
"title": "Node CPU Usage % (over time)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 6},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "legendFormat": "{{instance}}", "refId": "A"}],
|
||||
"title": "Node Memory Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 15},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes", "legendFormat": "used {{instance}}", "refId": "A"}, {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_memory_MemTotal_bytes", "legendFormat": "total {{instance}}", "refId": "B"}],
|
||||
"title": "Node Memory (used vs total)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 15},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|docker.*|br-.*|cni.*|flannel.*\"}[5m]))", "legendFormat": "RX {{instance}}", "refId": "A"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum by (instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|docker.*|br-.*|cni.*|flannel.*\"}[5m]))", "legendFormat": "TX {{instance}}", "refId": "B"}
|
||||
],
|
||||
"title": "Node Network Traffic",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 24},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"})) * 100", "legendFormat": "{{instance}} {{mountpoint}}", "refId": "A"}],
|
||||
"title": "Node Disk Usage %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
||||
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 24},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load1", "legendFormat": "1m {{instance}}", "refId": "A"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load5", "legendFormat": "5m {{instance}}", "refId": "B"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "node_load15", "legendFormat": "15m {{instance}}", "refId": "C"}
|
||||
],
|
||||
"title": "Node Load Average",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9, "w": 24, "x": 0, "y": 33},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"cellHeight": "sm",
|
||||
"footer": {"show": false, "reducer": ["sum"], "countRows": false, "fields": ""}
|
||||
},
|
||||
"pluginVersion": "10.2.3",
|
||||
"targets": [
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_pods", "format": "table", "instant": true, "refId": "A"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "kubelet_running_containers", "format": "table", "instant": true, "refId": "B"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "format": "table", "instant": true, "refId": "C"},
|
||||
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "format": "table", "instant": true, "refId": "D"}
|
||||
],
|
||||
"title": "Node Summary (live)",
|
||||
"type": "table",
|
||||
"transformations": [
|
||||
{"id": "merge", "options": {}},
|
||||
{"id": "groupBy", "options": {"fields": {"Value #A": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #B": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #C": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #D": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "instance": {"aggregations": [], "operation": "groupby"}}}},
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true}, "renameByName": {"Value #A": "Pods", "Value #B": "Containers", "Value #C": "CPU %", "Value #D": "Memory %"}}}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["k3s", "nodes"],
|
||||
"templating": {"list": []},
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Nodes",
|
||||
"uid": "k3s-nodes",
|
||||
"version": 2,
|
||||
"weekStart": ""
|
||||
}
|
||||
Reference in New Issue
Block a user