Calico Monitoring
Calico Rules
Below is the YAML-based rule set that Catapult uses. Each rule defines an alert name, an expression, a time frame (the `for` duration), labels, and annotations, which contain the description and summary.
x
bash-5.0# cat calico.yml
# Prometheus alerting rules for Calico (calico-node / Felix and Typha).
# Each rule evaluates `expr` and fires once the condition has held for `for`.
# All range selectors are shifted back with `offset 5m`.
groups:
  - name: calico.rules
    rules:
      #----------------------------------- Calico Node -------------------------------------
      # Felix Prometheus statistics: https://projectcalico.docs.tigera.io/reference/felix/prometheus

      # Percentage of calico-node metric-handler HTTP requests answered with a
      # 4xx/5xx status code, per (instance, job, cluster, host); fires above 1%.
      - alert: PromHTTPRequestErrors
        expr: |
          (
            sum(rate(promhttp_metric_handler_requests_total{job="calico-node", code=~"(4|5).."}[5m] offset 5m)) by (instance, job, cluster, host)
            /
            sum(rate(promhttp_metric_handler_requests_total{job="calico-node"}[5m] offset 5m)) by (instance, job, cluster, host)
          ) * 100 > 1
        for: 10m
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: "Cluster {{ $labels.cluster }}: HTTP requests errors on host {{ $labels.host }}."
          summary: "Calico HTTP requests errors on cluster {{ $labels.cluster }}"

      # More than 5 Felix dataplane failures over a one-hour window.
      - alert: CalicoDatapaneFailuresHigh
        expr: increase(felix_int_dataplane_failures[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} dataplane failures within the last hour'
          summary: 'A high number of dataplane failures within Felix are happening'

      # More than 5 Felix ipset errors over a one-hour window.
      - alert: CalicoIpsetErrorsHigh
        expr: increase(felix_ipset_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} ipset errors within the last hour'
          summary: 'A high number of ipset errors within Felix are happening'

      # More than 5 Felix iptables-save errors over a one-hour window.
      - alert: CalicoIptableSaveErrorsHigh
        expr: increase(felix_iptables_save_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} iptable save errors within the last hour'
          summary: 'A high number of iptable save errors within Felix are happening'

      # More than 5 Felix iptables-restore errors over a one-hour window.
      - alert: CalicoIptableRestoreErrorsHigh
        expr: increase(felix_iptables_restore_errors[1h] offset 5m) > 5
        for: 1h
        labels:
          severity: warning
          type: calico-node
        annotations:
          description: 'Felix cluster {{ $labels.cluster }} has seen {{ $value }} iptable restore errors within the last hour'
          summary: 'A high number of iptable restore errors within Felix are happening'

      #----------------------------------- Calico Kube Controllers -----------------------
      # kube-controllers Prometheus statistics: https://projectcalico.docs.tigera.io/reference/kube-controllers/prometheus
      # curl http://<pod_ip>:9094/metrics

      #----------------------------------- Calico Typha ----------------------------------
      # Typha Prometheus statistics: https://projectcalico.docs.tigera.io/reference/typha/prometheus

      # Mean ping latency (sum / count) above 0.1; the `and ... > 0` clause
      # restricts the alert to series that actually recorded pings in the window.
      - alert: TyphaPingLatency
        expr: rate(typha_ping_latency_sum[1m] offset 5m) / rate(typha_ping_latency_count[1m] offset 5m) > 0.1 and rate(typha_ping_latency_count[1m] offset 5m) > 0
        for: 2m
        labels:
          severity: warning
          type: calico-typha
        annotations:
          summary: "Typha Round-trip ping latency to client (cluster {{ $labels.cluster }})"
          description: "Typha latency is growing (ping operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Mean client write latency (sum / count) above 0.1; the `and ... > 0`
      # clause restricts the alert to series that recorded writes in the window.
      # NOTE(review): the summary says "instance" but interpolates the cluster
      # label -- confirm which one is intended.
      - alert: TyphaClientWriteLatency
        expr: rate(typha_client_write_latency_secs_sum[1m] offset 5m) / rate(typha_client_write_latency_secs_count[1m] offset 5m) > 0.1 and rate(typha_client_write_latency_secs_count[1m] offset 5m) > 0
        for: 2m
        labels:
          severity: warning
          type: calico-typha
        annotations:
          summary: "Typha unusual write latency (instance {{ $labels.cluster }})"
          description: "Typha client latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
bash-5.0#
Was this page helpful?