kube-prometheus

Signed-off-by: Martyn Ranyard <m@rtyn.berlin>
Martyn 2024-02-15 19:16:38 +01:00
parent 6e7f842efb
commit 4ce267de84
96 changed files with 66613 additions and 0 deletions

@@ -0,0 +1,702 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
argocd.argoproj.io/sync-wave: '0'
creationTimestamp: null
name: podmonitors.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PodMonitor
listKind: PodMonitorList
plural: podmonitors
shortNames:
- pmon
singular: podmonitor
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: PodMonitor defines monitoring for a set of pods.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource
this object represents. Servers may infer this from the endpoint the
client submits requests to. Cannot be updated. In CamelCase. More
info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: Specification of desired Pod selection for target discovery
by Prometheus.
properties:
attachMetadata:
description: Attaches node metadata to discovered targets. Requires
Prometheus v2.35.0 and above.
properties:
node:
description: When set to true, Prometheus must have permissions
to get Nodes.
type: boolean
type: object
jobLabel:
description: The label to use to retrieve the job name from.
type: string
labelLimit:
description: Per-scrape limit on number of labels that will be accepted
for a sample. Only valid in Prometheus versions 2.27.0 and newer.
format: int64
type: integer
labelNameLengthLimit:
description: Per-scrape limit on length of labels name that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
labelValueLengthLimit:
description: Per-scrape limit on length of labels value that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
namespaceSelector:
description: Selector to select which namespaces the Endpoints objects
are discovered from.
properties:
any:
description: Boolean describing whether all namespaces are selected
in contrast to a list restricting them.
type: boolean
matchNames:
description: List of namespace names to select from.
items:
type: string
type: array
type: object
podMetricsEndpoints:
description: A list of endpoints allowed as part of this PodMonitor.
items:
description: PodMetricsEndpoint defines a scrapeable endpoint
of a Kubernetes Pod serving Prometheus metrics.
properties:
authorization:
description: Authorization section for this endpoint
properties:
credentials:
description: Selects a key of a Secret in the namespace
that contains the credentials for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: "Defines the authentication type. The value\
\ is case-insensitive. \n \"Basic\" is not a supported\
\ value. \n Default: \"Bearer\""
type: string
type: object
basicAuth:
description: 'BasicAuth allow an endpoint to authenticate
over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint'
properties:
password:
description: The secret in the service monitor namespace
that contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
username:
description: The secret in the service monitor namespace
that contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
bearerTokenSecret:
description: Secret to mount to read bearer token for scraping
targets. The secret needs to be in the same namespace as
the pod monitor and accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
enableHttp2:
description: Whether to enable HTTP2.
type: boolean
filterRunning:
description: 'Drop pods that are not running. (Failed, Succeeded).
Enabled by default. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase'
type: boolean
followRedirects:
description: FollowRedirects configures whether scrape requests
follow HTTP 3xx redirects.
type: boolean
honorLabels:
description: HonorLabels chooses the metric's labels on collisions
with target labels.
type: boolean
honorTimestamps:
description: HonorTimestamps controls whether Prometheus respects
the timestamps present in scraped data.
type: boolean
interval:
description: Interval at which metrics should be scraped If
not specified Prometheus' global scrape interval is used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
metricRelabelings:
description: MetricRelabelConfigs to apply to samples before
ingestion.
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples and\
\ remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.\
\ \n `Uppercase` and `Lowercase` actions require Prometheus\
\ >= v2.36.0. `DropEqual` and `KeepEqual` actions\
\ require Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression matches.\
\ \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is\
\ written in a replacement. \n It is mandatory for\
\ `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex capture\
\ groups are available."
type: string
type: object
type: array
oauth2:
description: OAuth2 for the URL. Only valid in Prometheus
versions 2.27.0 and newer.
properties:
clientId:
description: The secret or configmap containing the OAuth2
client id
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
clientSecret:
description: The secret containing the OAuth2 client secret
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
endpointParams:
additionalProperties:
type: string
description: Parameters to append to the token URL
type: object
scopes:
description: OAuth2 scopes used for the token request
items:
type: string
type: array
tokenUrl:
description: The URL to fetch the token from
minLength: 1
type: string
required:
- clientId
- clientSecret
- tokenUrl
type: object
params:
additionalProperties:
items:
type: string
type: array
description: Optional HTTP URL parameters
type: object
path:
description: HTTP path to scrape for metrics. If empty, Prometheus
uses the default value (e.g. `/metrics`).
type: string
port:
description: Name of the pod port this endpoint refers to.
Mutually exclusive with targetPort.
type: string
proxyUrl:
description: ProxyURL eg http://proxyserver:2195 Directs scrapes
to proxy through this endpoint.
type: string
relabelings:
description: 'RelabelConfigs to apply to samples before scraping.
Prometheus Operator automatically adds relabelings for a
few standard Kubernetes fields. The original scrape job''s
name is available via the `__tmp_prometheus_job_name` label.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples and\
\ remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.\
\ \n `Uppercase` and `Lowercase` actions require Prometheus\
\ >= v2.36.0. `DropEqual` and `KeepEqual` actions\
\ require Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression matches.\
\ \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is\
\ written in a replacement. \n It is mandatory for\
\ `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex capture\
\ groups are available."
type: string
type: object
type: array
scheme:
description: HTTP scheme to use for scraping. `http` and `https`
are the expected values unless you rewrite the `__scheme__`
label via relabeling. If empty, Prometheus uses the default
value `http`.
enum:
- http
- https
type: string
scrapeTimeout:
description: Timeout after which the scrape is ended If not
specified, the Prometheus global scrape interval is used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
targetPort:
anyOf:
- type: integer
- type: string
description: 'Deprecated: Use ''port'' instead.'
x-kubernetes-int-or-string: true
tlsConfig:
description: TLS configuration to use when scraping the endpoint.
properties:
ca:
description: Certificate authority used when verifying
server certificates.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
cert:
description: Client certificate to present when doing
client-authentication.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keySecret:
description: Secret containing the client key file for
the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
type: array
podTargetLabels:
description: PodTargetLabels transfers labels on the Kubernetes
Pod onto the target.
items:
type: string
type: array
sampleLimit:
description: SampleLimit defines per-scrape limit on number of scraped
samples that will be accepted.
format: int64
type: integer
selector:
description: Selector to select Pod objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that
contains values, a key, and an operator that relates the
key and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn, Exists
and DoesNotExist.
type: string
values:
description: values is an array of string values. If the
operator is In or NotIn, the values array must be non-empty.
If the operator is Exists or DoesNotExist, the values
array must be empty. This array is replaced during a
strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator
is "In", and the values array contains only "value". The requirements
are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
targetLimit:
description: TargetLimit defines a limit on the number of scraped
targets that will be accepted.
format: int64
type: integer
required:
- podMetricsEndpoints
- selector
type: object
required:
- spec
type: object
served: true
storage: true
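
The CRD above only defines the schema; actual scrape targets are declared by creating PodMonitor objects against it. A minimal sketch of such an object is shown below — the namespace, labels, and port name are illustrative assumptions, not values taken from this commit:

apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: example-app            # hypothetical name
  namespace: monitoring        # assumed namespace
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: example-app   # assumed pod label
  namespaceSelector:
    matchNames:
      - default
  podMetricsEndpoints:
    - port: metrics            # assumed named container port
      path: /metrics
      interval: 30s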

@@ -0,0 +1,736 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
argocd.argoproj.io/sync-wave: '0'
creationTimestamp: null
name: probes.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: Probe
listKind: ProbeList
plural: probes
shortNames:
- prb
singular: probe
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: Probe defines monitoring for a set of static targets or ingresses.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource
this object represents. Servers may infer this from the endpoint the
client submits requests to. Cannot be updated. In CamelCase. More
info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: Specification of desired Ingress selection for target discovery
by Prometheus.
properties:
authorization:
description: Authorization section for this endpoint
properties:
credentials:
description: Selects a key of a Secret in the namespace that
contains the credentials for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: "Defines the authentication type. The value is\
\ case-insensitive. \n \"Basic\" is not a supported value.\
\ \n Default: \"Bearer\""
type: string
type: object
basicAuth:
description: 'BasicAuth allow an endpoint to authenticate over basic
authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint'
properties:
password:
description: The secret in the service monitor namespace that
contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
username:
description: The secret in the service monitor namespace that
contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
bearerTokenSecret:
description: Secret to mount to read bearer token for scraping targets.
The secret needs to be in the same namespace as the probe and
accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
interval:
description: Interval at which targets are probed using the configured
prober. If not specified Prometheus' global scrape interval is
used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
jobName:
description: The job name assigned to scraped metrics by default.
type: string
labelLimit:
description: Per-scrape limit on number of labels that will be accepted
for a sample. Only valid in Prometheus versions 2.27.0 and newer.
format: int64
type: integer
labelNameLengthLimit:
description: Per-scrape limit on length of labels name that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
labelValueLengthLimit:
description: Per-scrape limit on length of labels value that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
metricRelabelings:
description: MetricRelabelConfigs to apply to samples before ingestion.
items:
description: "RelabelConfig allows dynamic rewriting of the label\
\ set for targets, alerts, scraped samples and remote write\
\ samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.\
\ \n `Uppercase` and `Lowercase` actions require Prometheus\
\ >= v2.36.0. `DropEqual` and `KeepEqual` actions require\
\ Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source label\
\ values. \n Only applicable when the action is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace action\
\ is performed if the regular expression matches. \n Regex\
\ capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular expression.
items:
description: LabelName is a valid Prometheus label name
which may only contain ASCII letters, numbers, as well
as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is written\
\ in a replacement. \n It is mandatory for `Replace`, `HashMod`,\
\ `Lowercase`, `Uppercase`, `KeepEqual` and `DropEqual`\
\ actions. \n Regex capture groups are available."
type: string
type: object
type: array
module:
description: 'The module to use for probing specifying how to probe
the target. Example module configuring in the blackbox exporter:
https://github.com/prometheus/blackbox_exporter/blob/master/example.yml'
type: string
oauth2:
description: OAuth2 for the URL. Only valid in Prometheus versions
2.27.0 and newer.
properties:
clientId:
description: The secret or configmap containing the OAuth2 client
id
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
clientSecret:
description: The secret containing the OAuth2 client secret
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
endpointParams:
additionalProperties:
type: string
description: Parameters to append to the token URL
type: object
scopes:
description: OAuth2 scopes used for the token request
items:
type: string
type: array
tokenUrl:
description: The URL to fetch the token from
minLength: 1
type: string
required:
- clientId
- clientSecret
- tokenUrl
type: object
prober:
description: Specification for the prober to use for probing targets.
The prober.URL parameter is required. Targets cannot be probed
if left empty.
properties:
path:
default: /probe
description: Path to collect metrics from. Defaults to `/probe`.
type: string
proxyUrl:
description: Optional ProxyURL.
type: string
scheme:
description: HTTP scheme to use for scraping. `http` and `https`
are the expected values unless you rewrite the `__scheme__`
label via relabeling. If empty, Prometheus uses the default
value `http`.
enum:
- http
- https
type: string
url:
description: Mandatory URL of the prober.
type: string
required:
- url
type: object
sampleLimit:
description: SampleLimit defines per-scrape limit on number of scraped
samples that will be accepted.
format: int64
type: integer
scrapeTimeout:
description: Timeout for scraping metrics from the Prometheus exporter.
If not specified, the Prometheus global scrape timeout is used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
targetLimit:
description: TargetLimit defines a limit on the number of scraped
targets that will be accepted.
format: int64
type: integer
targets:
description: Targets defines a set of static or dynamically discovered
targets to probe.
properties:
ingress:
description: ingress defines the Ingress objects to probe and
the relabeling configuration. If `staticConfig` is also defined,
`staticConfig` takes precedence.
properties:
namespaceSelector:
description: From which namespaces to select Ingress objects.
properties:
any:
description: Boolean describing whether all namespaces
are selected in contrast to a list restricting them.
type: boolean
matchNames:
description: List of namespace names to select from.
items:
type: string
type: array
type: object
relabelingConfigs:
description: 'RelabelConfigs to apply to the label set of
the target before it gets scraped. The original ingress
address is available via the `__tmp_prometheus_ingress_address`
label. It can be used to customize the probed URL. The
original scrape job''s name is available via the `__tmp_prometheus_job_name`
label. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples\
\ and remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex\
\ matching. \n `Uppercase` and `Lowercase` actions\
\ require Prometheus >= v2.36.0. `DropEqual` and\
\ `KeepEqual` actions require Prometheus >= v2.41.0.\
\ \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the
extracted value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression\
\ matches. \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from
existing labels. Their content is concatenated using
the configured Separator and matched against the
configured regular expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string\
\ is written in a replacement. \n It is mandatory\
\ for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex\
\ capture groups are available."
type: string
type: object
type: array
selector:
description: Selector to select the Ingress objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector
requirements. The requirements are ANDed.
items:
description: A label selector requirement is a selector
that contains values, a key, and an operator that
relates the key and values.
properties:
key:
description: key is the label key that the selector
applies to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In,
NotIn, Exists and DoesNotExist.
type: string
values:
description: values is an array of string values.
If the operator is In or NotIn, the values array
must be non-empty. If the operator is Exists
or DoesNotExist, the values array must be empty.
This array is replaced during a strategic merge
patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs.
A single {key,value} in the matchLabels map is equivalent
to an element of matchExpressions, whose key field
is "key", the operator is "In", and the values array
contains only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
type: object
staticConfig:
description: 'staticConfig defines the static list of targets
to probe and the relabeling configuration. If `ingress` is
also defined, `staticConfig` takes precedence. More info:
https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config.'
properties:
labels:
additionalProperties:
type: string
description: Labels assigned to all metrics scraped from
the targets.
type: object
relabelingConfigs:
description: 'RelabelConfigs to apply to the label set of
the targets before it gets scraped. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples\
\ and remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex\
\ matching. \n `Uppercase` and `Lowercase` actions\
\ require Prometheus >= v2.36.0. `DropEqual` and\
\ `KeepEqual` actions require Prometheus >= v2.41.0.\
\ \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the
extracted value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression\
\ matches. \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from
existing labels. Their content is concatenated using
the configured Separator and matched against the
configured regular expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string\
\ is written in a replacement. \n It is mandatory\
\ for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex\
\ capture groups are available."
type: string
type: object
type: array
static:
description: The list of hosts to probe.
items:
type: string
type: array
type: object
type: object
tlsConfig:
description: TLS configuration to use when scraping the endpoint.
properties:
ca:
description: Certificate authority used when verifying server
certificates.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
cert:
description: Client certificate to present when doing client-authentication.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keySecret:
description: Secret containing the client key file for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
required:
- spec
type: object
served: true
storage: true
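
The Probe CRD is normally paired with a prober such as blackbox-exporter. A minimal sketch, assuming a blackbox-exporter service and an http_2xx module (neither is defined in this commit), could look like:

apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: example-http-probe     # hypothetical name
  namespace: monitoring        # assumed namespace
spec:
  jobName: http-get
  prober:
    url: blackbox-exporter.monitoring.svc:9115   # assumed prober service
    scheme: http
    path: /probe
  module: http_2xx                               # assumed blackbox module
  targets:
    staticConfig:
      static:
        - https://example.com
  interval: 60s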

@@ -0,0 +1,133 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
argocd.argoproj.io/sync-wave: '0'
creationTimestamp: null
name: prometheusrules.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PrometheusRule
listKind: PrometheusRuleList
plural: prometheusrules
shortNames:
- promrule
singular: prometheusrule
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: PrometheusRule defines recording and alerting rules for a Prometheus
instance
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource
this object represents. Servers may infer this from the endpoint the
client submits requests to. Cannot be updated. In CamelCase. More
info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: Specification of desired alerting rule definitions for
Prometheus.
properties:
groups:
description: Content of Prometheus rule file
items:
description: RuleGroup is a list of sequentially evaluated recording
and alerting rules.
properties:
interval:
description: Interval determines how often rules in the group
are evaluated.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
limit:
description: Limit the number of alerts an alerting rule and
series a recording rule can produce. Limit is supported
starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24.
type: integer
name:
description: Name of the rule group.
minLength: 1
type: string
partial_response_strategy:
description: 'PartialResponseStrategy is only used by ThanosRuler
and will be ignored by Prometheus instances. More info:
https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response'
pattern: ^(?i)(abort|warn)?$
type: string
rules:
description: List of alerting and recording rules.
items:
description: 'Rule describes an alerting or recording rule
See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules)
rule'
properties:
alert:
description: Name of the alert. Must be a valid label
value. Only one of `record` and `alert` must be set.
type: string
annotations:
additionalProperties:
type: string
description: Annotations to add to each alert. Only
valid for alerting rules.
type: object
expr:
anyOf:
- type: integer
- type: string
description: PromQL expression to evaluate.
x-kubernetes-int-or-string: true
for:
description: Alerts are considered firing once they
have been returned for this long.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
keep_firing_for:
description: KeepFiringFor defines how long an alert
will continue firing after the condition that triggered
it has cleared.
minLength: 1
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
labels:
additionalProperties:
type: string
description: Labels to add or overwrite.
type: object
record:
description: Name of the time series to output to. Must
be a valid metric name. Only one of `record` and `alert`
must be set.
type: string
required:
- expr
type: object
type: array
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
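
A small PrometheusRule conforming to this schema might look like the following sketch; the group name, alert name, and expressions are illustrative only:

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: example-rules          # hypothetical name
  namespace: monitoring        # assumed namespace
spec:
  groups:
    - name: example.rules
      interval: 1m
      rules:
        - record: job:up:sum             # recording rule: output series name
          expr: sum by (job) (up)
        - alert: TargetDown              # alerting rule
          expr: up == 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Target {{ $labels.instance }} has been down for 5 minutes."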

@@ -0,0 +1,734 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
argocd.argoproj.io/sync-wave: '0'
creationTimestamp: null
name: servicemonitors.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: ServiceMonitor
listKind: ServiceMonitorList
plural: servicemonitors
shortNames:
- smon
singular: servicemonitor
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: ServiceMonitor defines monitoring for a set of services.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource
this object represents. Servers may infer this from the endpoint the
client submits requests to. Cannot be updated. In CamelCase. More
info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: Specification of desired Service selection for target discovery
by Prometheus.
properties:
attachMetadata:
description: Attaches node metadata to discovered targets. Requires
Prometheus v2.37.0 and above.
properties:
node:
description: When set to true, Prometheus must have permissions
to get Nodes.
type: boolean
type: object
endpoints:
description: A list of endpoints allowed as part of this ServiceMonitor.
items:
description: Endpoint defines a scrapeable endpoint serving Prometheus
metrics.
properties:
authorization:
description: Authorization section for this endpoint
properties:
credentials:
description: Selects a key of a Secret in the namespace
that contains the credentials for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: "Defines the authentication type. The value\
\ is case-insensitive. \n \"Basic\" is not a supported\
\ value. \n Default: \"Bearer\""
type: string
type: object
basicAuth:
description: 'BasicAuth allow an endpoint to authenticate
over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints'
properties:
password:
description: The secret in the service monitor namespace
that contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
username:
description: The secret in the service monitor namespace
that contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
bearerTokenFile:
description: File to read bearer token for scraping targets.
type: string
bearerTokenSecret:
description: Secret to mount to read bearer token for scraping
targets. The secret needs to be in the same namespace as
the service monitor and accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
enableHttp2:
description: Whether to enable HTTP2.
type: boolean
filterRunning:
description: 'Drop pods that are not running. (Failed, Succeeded).
Enabled by default. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase'
type: boolean
followRedirects:
description: FollowRedirects configures whether scrape requests
follow HTTP 3xx redirects.
type: boolean
honorLabels:
description: HonorLabels chooses the metric's labels on collisions
with target labels.
type: boolean
honorTimestamps:
description: HonorTimestamps controls whether Prometheus respects
the timestamps present in scraped data.
type: boolean
interval:
description: Interval at which metrics should be scraped If
not specified Prometheus' global scrape interval is used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
metricRelabelings:
description: MetricRelabelConfigs to apply to samples before
ingestion.
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples and\
\ remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.\
\ \n `Uppercase` and `Lowercase` actions require Prometheus\
\ >= v2.36.0. `DropEqual` and `KeepEqual` actions\
\ require Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression matches.\
\ \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is\
\ written in a replacement. \n It is mandatory for\
\ `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex capture\
\ groups are available."
type: string
type: object
type: array
oauth2:
description: OAuth2 for the URL. Only valid in Prometheus
versions 2.27.0 and newer.
properties:
clientId:
description: The secret or configmap containing the OAuth2
client id
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
clientSecret:
description: The secret containing the OAuth2 client secret
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
endpointParams:
additionalProperties:
type: string
description: Parameters to append to the token URL
type: object
scopes:
description: OAuth2 scopes used for the token request
items:
type: string
type: array
tokenUrl:
description: The URL to fetch the token from
minLength: 1
type: string
required:
- clientId
- clientSecret
- tokenUrl
type: object
params:
additionalProperties:
items:
type: string
type: array
description: Optional HTTP URL parameters
type: object
path:
description: HTTP path to scrape for metrics. If empty, Prometheus
uses the default value (e.g. `/metrics`).
type: string
port:
description: Name of the service port this endpoint refers
to. Mutually exclusive with targetPort.
type: string
proxyUrl:
description: ProxyURL eg http://proxyserver:2195 Directs scrapes
to proxy through this endpoint.
type: string
relabelings:
description: 'RelabelConfigs to apply to samples before scraping.
Prometheus Operator automatically adds relabelings for a
few standard Kubernetes fields. The original scrape job''s
name is available via the `__tmp_prometheus_job_name` label.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: "RelabelConfig allows dynamic rewriting of\
\ the label set for targets, alerts, scraped samples and\
\ remote write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.\
\ \n `Uppercase` and `Lowercase` actions require Prometheus\
\ >= v2.36.0. `DropEqual` and `KeepEqual` actions\
\ require Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source\
\ label values. \n Only applicable when the action\
\ is `HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace\
\ action is performed if the regular expression matches.\
\ \n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label
name which may only contain ASCII letters, numbers,
as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is\
\ written in a replacement. \n It is mandatory for\
\ `Replace`, `HashMod`, `Lowercase`, `Uppercase`,\
\ `KeepEqual` and `DropEqual` actions. \n Regex capture\
\ groups are available."
type: string
type: object
type: array
scheme:
description: HTTP scheme to use for scraping. `http` and `https`
are the expected values unless you rewrite the `__scheme__`
label via relabeling. If empty, Prometheus uses the default
value `http`.
enum:
- http
- https
type: string
scrapeTimeout:
description: Timeout after which the scrape is ended. If not specified, the Prometheus global scrape timeout is used, unless it is less than `Interval`, in which case the latter is used.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
targetPort:
anyOf:
- type: integer
- type: string
description: Name or number of the target port of the Pod
behind the Service, the port must be specified with container
port property. Mutually exclusive with port.
x-kubernetes-int-or-string: true
tlsConfig:
description: TLS configuration to use when scraping the endpoint
properties:
ca:
description: Certificate authority used when verifying
server certificates.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
caFile:
description: Path to the CA cert in the Prometheus container
to use for the targets.
type: string
cert:
description: Client certificate to present when doing
client-authentication.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
certFile:
description: Path to the client cert file in the Prometheus
container for the targets.
type: string
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keyFile:
description: Path to the client key file in the Prometheus
container for the targets.
type: string
keySecret:
description: Secret containing the client key file for
the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
type: array
jobLabel:
description: "JobLabel selects the label from the associated Kubernetes\
\ service which will be used as the `job` label for all metrics.\
\ \n For example: If in `ServiceMonitor.spec.jobLabel: foo` and\
\ in `Service.metadata.labels.foo: bar`, then the `job=\"bar\"\
` label is added to all metrics. \n If the value of this field\
\ is empty or if the label doesn't exist for the given Service,\
\ the `job` label of the metrics defaults to the name of the Kubernetes\
\ Service."
type: string
labelLimit:
description: Per-scrape limit on number of labels that will be accepted
for a sample. Only valid in Prometheus versions 2.27.0 and newer.
format: int64
type: integer
labelNameLengthLimit:
description: Per-scrape limit on length of labels name that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
labelValueLengthLimit:
description: Per-scrape limit on length of labels value that will
be accepted for a sample. Only valid in Prometheus versions 2.27.0
and newer.
format: int64
type: integer
namespaceSelector:
description: Selector to select which namespaces the Kubernetes
Endpoints objects are discovered from.
properties:
any:
description: Boolean describing whether all namespaces are selected
in contrast to a list restricting them.
type: boolean
matchNames:
description: List of namespace names to select from.
items:
type: string
type: array
type: object
podTargetLabels:
description: PodTargetLabels transfers labels on the Kubernetes
`Pod` onto the created metrics.
items:
type: string
type: array
sampleLimit:
description: SampleLimit defines per-scrape limit on number of scraped
samples that will be accepted.
format: int64
type: integer
selector:
description: Selector to select Endpoints objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that
contains values, a key, and an operator that relates the
key and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn, Exists
and DoesNotExist.
type: string
values:
description: values is an array of string values. If the
operator is In or NotIn, the values array must be non-empty.
If the operator is Exists or DoesNotExist, the values
array must be empty. This array is replaced during a
strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator
is "In", and the values array contains only "value". The requirements
are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
targetLabels:
description: TargetLabels transfers labels from the Kubernetes `Service`
onto the created metrics.
items:
type: string
type: array
targetLimit:
description: TargetLimit defines a limit on the number of scraped
targets that will be accepted.
format: int64
type: integer
required:
- endpoints
- selector
type: object
required:
- spec
type: object
served: true
storage: true
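
The schema above requires `endpoints` and `selector`, i.e. it describes the ServiceMonitor resource. For orientation only, a minimal object satisfying that schema could look like the sketch below; the name, namespace, label and port are illustrative and not part of this commit.

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app
  namespace: monitoring
spec:
  endpoints:
  - port: web
    interval: 30s
  selector:
    matchLabels:
      app.kubernetes.io/name: example-app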

File diff suppressed because it is too large

@ -0,0 +1,38 @@
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
image: quay.io/prometheus/alertmanager:v0.26.0
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: 0.26.0


@ -0,0 +1,44 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: alertmanager-main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 9093
protocol: TCP
- port: 8080
protocol: TCP
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: alertmanager
ports:
- port: 9094
protocol: TCP
- port: 9094
protocol: UDP
podSelector:
matchLabels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,21 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: alertmanager-main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
maxUnavailable: 1
selector:
matchLabels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,140 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
prometheus: k8s
role: alert-rules
name: alertmanager-main-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace
}}/{{ $labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: '# Without max_over_time, failed scrapes could create false negatives,
see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0
for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m])
== 0
'
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has
only found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster
members.
expr: "# Without max_over_time, failed scrapes could create false negatives,\
\ see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n max_over_time(alertmanager_cluster_members{job=\"alertmanager-main\"\
,namespace=\"monitoring\"}[5m])\n< on (namespace,service) group_left\n\
\ count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=\"\
alertmanager-main\",namespace=\"monitoring\"}[5m]))\n"
for: 15m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: "(\n rate(alertmanager_notifications_failed_total{job=\"alertmanager-main\"\
,namespace=\"monitoring\"}[5m])\n/\n rate(alertmanager_notifications_total{job=\"\
alertmanager-main\",namespace=\"monitoring\"}[5m])\n)\n> 0.01\n"
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value
| humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"\
alertmanager-main\",namespace=\"monitoring\", integration=~`.*`}[5m])\n\
/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"\
monitoring\", integration=~`.*`}[5m])\n)\n> 0.01\n"
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value
| humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
expr: "min by (namespace,service, integration) (\n rate(alertmanager_notifications_failed_total{job=\"\
alertmanager-main\",namespace=\"monitoring\", integration!~`.*`}[5m])\n\
/\n rate(alertmanager_notifications_total{job=\"alertmanager-main\",namespace=\"\
monitoring\", integration!~`.*`}[5m])\n)\n> 0.01\n"
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster
have different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different
configurations.
expr: "count by (namespace,service) (\n count_values by (namespace,service)\
\ (\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"\
,namespace=\"monitoring\"})\n)\n!= 1\n"
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of
the last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster
are down.
expr: "(\n count by (namespace,service) (\n avg_over_time(up{job=\"\
alertmanager-main\",namespace=\"monitoring\"}[5m]) < 0.5\n )\n/\n count\
\ by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"\
monitoring\"}\n )\n)\n>= 0.5\n"
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in
the last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
expr: "(\n count by (namespace,service) (\n changes(process_start_time_seconds{job=\"\
alertmanager-main\",namespace=\"monitoring\"}[10m]) > 4\n )\n/\n count\
\ by (namespace,service) (\n up{job=\"alertmanager-main\",namespace=\"\
monitoring\"}\n )\n)\n>= 0.5\n"
for: 5m
labels:
severity: critical
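
The `expr` fields in this PrometheusRule are stored as escaped single-line strings. As a readability aid only (the manifest itself is unchanged), the AlertmanagerFailedToSendAlerts expression unescapes to the PromQL below, which fires when more than 1% of notifications to an integration fail over 5 minutes:

expr: |
  (
    rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
  /
    rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
  )
  > 0.01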


@ -0,0 +1,27 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: alertmanager-main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
ports:
- name: web
port: 9093
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP


@ -0,0 +1,14 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: alertmanager-main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,25 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.26.0
name: alertmanager-main
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- interval: 30s
port: web
- interval: 30s
port: reloader-web
selector:
matchLabels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: blackbox-exporter
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create


@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: blackbox-exporter
subjects:
- kind: ServiceAccount
name: blackbox-exporter
namespace: monitoring


@ -0,0 +1,27 @@
apiVersion: v1
data:
config.yml: "\"modules\":\n \"http_2xx\":\n \"http\":\n \"preferred_ip_protocol\"\
: \"ip4\"\n \"prober\": \"http\"\n \"http_post_2xx\":\n \"http\":\n \
\ \"method\": \"POST\"\n \"preferred_ip_protocol\": \"ip4\"\n \"prober\"\
: \"http\"\n \"irc_banner\":\n \"prober\": \"tcp\"\n \"tcp\":\n \"\
preferred_ip_protocol\": \"ip4\"\n \"query_response\":\n - \"send\"\
: \"NICK prober\"\n - \"send\": \"USER prober prober prober :prober\"\n \
\ - \"expect\": \"PING :([^ ]+)\"\n \"send\": \"PONG ${1}\"\n \
\ - \"expect\": \"^:[^ ]+ 001\"\n \"pop3s_banner\":\n \"prober\": \"tcp\"\n\
\ \"tcp\":\n \"preferred_ip_protocol\": \"ip4\"\n \"query_response\"\
:\n - \"expect\": \"^+OK\"\n \"tls\": true\n \"tls_config\":\n\
\ \"insecure_skip_verify\": false\n \"ssh_banner\":\n \"prober\": \"\
tcp\"\n \"tcp\":\n \"preferred_ip_protocol\": \"ip4\"\n \"query_response\"\
:\n - \"expect\": \"^SSH-2.0-\"\n \"tcp_connect\":\n \"prober\": \"tcp\"\
\n \"tcp\":\n \"preferred_ip_protocol\": \"ip4\""
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter-configuration
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
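
The `config.yml` value above is a single escaped string. Decoded (excerpt shown purely for readability; the ConfigMap content is unchanged), the first module is a plain HTTP prober:

"modules":
  "http_2xx":
    "http":
      "preferred_ip_protocol": "ip4"
    "prober": "http"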


@ -0,0 +1,116 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: blackbox-exporter
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
spec:
automountServiceAccountToken: true
containers:
- args:
- --config.file=/etc/blackbox_exporter/config.yml
- --web.listen-address=:19115
image: quay.io/prometheus/blackbox-exporter:v0.24.0
name: blackbox-exporter
ports:
- containerPort: 19115
name: http
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 65534
volumeMounts:
- mountPath: /etc/blackbox_exporter/
name: config
readOnly: true
- args:
- --webhook-url=http://localhost:19115/-/reload
- --volume-dir=/etc/blackbox_exporter/
image: jimmidyson/configmap-reload:v0.5.0
name: module-configmap-reloader
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 65534
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /etc/blackbox_exporter/
name: config
readOnly: true
- args:
- --secure-listen-address=:9115
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:19115/
image: quay.io/brancz/kube-rbac-proxy:v0.14.2
name: kube-rbac-proxy
ports:
- containerPort: 9115
name: https
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: blackbox-exporter
volumes:
- configMap:
name: blackbox-exporter-configuration
name: config


@ -0,0 +1,33 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 9115
protocol: TCP
- port: 19115
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,24 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
ports:
- name: https
port: 9115
targetPort: https
- name: probe
port: 19115
targetPort: http
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
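
The `probe` port (19115) exposes the exporter's plain HTTP endpoint, while `https` (9115) goes through kube-rbac-proxy. Assuming the Probe CRD from prometheus-operator is installed alongside the other CRDs in this commit, a probe against this Service might look like the following sketch; the resource name and target URL are hypothetical:

apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: example-website
  namespace: monitoring
spec:
  module: http_2xx
  prober:
    url: blackbox-exporter.monitoring.svc:19115
  targets:
    staticConfig:
      static:
      - https://example.com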


@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,26 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: blackbox-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
path: /metrics
port: https
scheme: https
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: blackbox-exporter
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,19 @@
apiVersion: v1
kind: Secret
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana-config
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
stringData:
grafana.ini: '[date_formats]
default_timezone = UTC
'
type: Opaque


@ -0,0 +1,19 @@
apiVersion: v1
kind: Secret
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana-datasources
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
stringData:
datasources.yaml: "{\n \"apiVersion\": 1,\n \"datasources\": [\n {\n\
\ \"access\": \"proxy\",\n \"editable\": false,\n \
\ \"name\": \"prometheus\",\n \"orgId\": 1,\n \"type\"\
: \"prometheus\",\n \"url\": \"http://prometheus-k8s.monitoring.svc:9090\"\
,\n \"version\": 1\n }\n ]\n}"
type: Opaque
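
The `datasources.yaml` value above embeds a JSON document in an escaped string. Decoded for readability (the Secret content itself is unchanged), it provisions a single non-editable Prometheus datasource pointing at the in-cluster service:

datasources.yaml: |
  {
    "apiVersion": 1,
    "datasources": [
      {
        "access": "proxy",
        "editable": false,
        "name": "prometheus",
        "orgId": 1,
        "type": "prometheus",
        "url": "http://prometheus-k8s.monitoring.svc:9090",
        "version": 1
      }
    ]
  }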

File diff suppressed because it is too large

@ -0,0 +1,18 @@
apiVersion: v1
data:
dashboards.yaml: "{\n \"apiVersion\": 1,\n \"providers\": [\n {\n \
\ \"folder\": \"Default\",\n \"folderUid\": \"\",\n \
\ \"name\": \"0\",\n \"options\": {\n \"path\"\
: \"/grafana-dashboard-definitions/0\"\n },\n \"orgId\"\
: 1,\n \"type\": \"file\"\n }\n ]\n}"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana-dashboards
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,252 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
checksum/grafana-config: 5c598ba58d9b65011bdbb3864138399a
checksum/grafana-dashboardproviders: c9c1743868aa1c3dab60d2c402e2dcf0
checksum/grafana-datasources: 5ef0e6acaa5b4e8603740fbad440717d
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
spec:
automountServiceAccountToken: false
containers:
- env: []
image: grafana/grafana:9.5.3
name: grafana
ports:
- containerPort: 3000
name: http
readinessProbe:
httpGet:
path: /api/health
port: http
resources:
limits:
cpu: 200m
memory: 200Mi
requests:
cpu: 100m
memory: 100Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /var/lib/grafana
name: grafana-storage
readOnly: false
- mountPath: /etc/grafana/provisioning/datasources
name: grafana-datasources
readOnly: false
- mountPath: /etc/grafana/provisioning/dashboards
name: grafana-dashboards
readOnly: false
- mountPath: /tmp
name: tmp-plugins
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/alertmanager-overview
name: grafana-dashboard-alertmanager-overview
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/apiserver
name: grafana-dashboard-apiserver
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/cluster-total
name: grafana-dashboard-cluster-total
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/controller-manager
name: grafana-dashboard-controller-manager
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/grafana-overview
name: grafana-dashboard-grafana-overview
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
name: grafana-dashboard-k8s-resources-cluster
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-multicluster
name: grafana-dashboard-k8s-resources-multicluster
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace
name: grafana-dashboard-k8s-resources-namespace
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-node
name: grafana-dashboard-k8s-resources-node
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod
name: grafana-dashboard-k8s-resources-pod
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
name: grafana-dashboard-k8s-resources-workload
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
name: grafana-dashboard-k8s-resources-workloads-namespace
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/kubelet
name: grafana-dashboard-kubelet
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/namespace-by-pod
name: grafana-dashboard-namespace-by-pod
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/namespace-by-workload
name: grafana-dashboard-namespace-by-workload
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-rsrc-use
name: grafana-dashboard-node-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/nodes-darwin
name: grafana-dashboard-nodes-darwin
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/nodes
name: grafana-dashboard-nodes
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/pod-total
name: grafana-dashboard-pod-total
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/prometheus-remote-write
name: grafana-dashboard-prometheus-remote-write
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/prometheus
name: grafana-dashboard-prometheus
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/proxy
name: grafana-dashboard-proxy
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/scheduler
name: grafana-dashboard-scheduler
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/workload-total
name: grafana-dashboard-workload-total
readOnly: false
- mountPath: /etc/grafana
name: grafana-config
readOnly: false
nodeSelector:
kubernetes.io/os: linux
securityContext:
fsGroup: 65534
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: grafana
volumes:
- emptyDir: {}
name: grafana-storage
- name: grafana-datasources
secret:
secretName: grafana-datasources
- configMap:
name: grafana-dashboards
name: grafana-dashboards
- emptyDir:
medium: Memory
name: tmp-plugins
- configMap:
name: grafana-dashboard-alertmanager-overview
name: grafana-dashboard-alertmanager-overview
- configMap:
name: grafana-dashboard-apiserver
name: grafana-dashboard-apiserver
- configMap:
name: grafana-dashboard-cluster-total
name: grafana-dashboard-cluster-total
- configMap:
name: grafana-dashboard-controller-manager
name: grafana-dashboard-controller-manager
- configMap:
name: grafana-dashboard-grafana-overview
name: grafana-dashboard-grafana-overview
- configMap:
name: grafana-dashboard-k8s-resources-cluster
name: grafana-dashboard-k8s-resources-cluster
- configMap:
name: grafana-dashboard-k8s-resources-multicluster
name: grafana-dashboard-k8s-resources-multicluster
- configMap:
name: grafana-dashboard-k8s-resources-namespace
name: grafana-dashboard-k8s-resources-namespace
- configMap:
name: grafana-dashboard-k8s-resources-node
name: grafana-dashboard-k8s-resources-node
- configMap:
name: grafana-dashboard-k8s-resources-pod
name: grafana-dashboard-k8s-resources-pod
- configMap:
name: grafana-dashboard-k8s-resources-workload
name: grafana-dashboard-k8s-resources-workload
- configMap:
name: grafana-dashboard-k8s-resources-workloads-namespace
name: grafana-dashboard-k8s-resources-workloads-namespace
- configMap:
name: grafana-dashboard-kubelet
name: grafana-dashboard-kubelet
- configMap:
name: grafana-dashboard-namespace-by-pod
name: grafana-dashboard-namespace-by-pod
- configMap:
name: grafana-dashboard-namespace-by-workload
name: grafana-dashboard-namespace-by-workload
- configMap:
name: grafana-dashboard-node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
- configMap:
name: grafana-dashboard-node-rsrc-use
name: grafana-dashboard-node-rsrc-use
- configMap:
name: grafana-dashboard-nodes-darwin
name: grafana-dashboard-nodes-darwin
- configMap:
name: grafana-dashboard-nodes
name: grafana-dashboard-nodes
- configMap:
name: grafana-dashboard-persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage
- configMap:
name: grafana-dashboard-pod-total
name: grafana-dashboard-pod-total
- configMap:
name: grafana-dashboard-prometheus-remote-write
name: grafana-dashboard-prometheus-remote-write
- configMap:
name: grafana-dashboard-prometheus
name: grafana-dashboard-prometheus
- configMap:
name: grafana-dashboard-proxy
name: grafana-dashboard-proxy
- configMap:
name: grafana-dashboard-scheduler
name: grafana-dashboard-scheduler
- configMap:
name: grafana-dashboard-workload-total
name: grafana-dashboard-workload-total
- name: grafana-config
secret:
secretName: grafana-config
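
Every dashboard in this Deployment follows the same three-part wiring: a ConfigMap, a volumeMount under /grafana-dashboard-definitions/0/<name>, and a matching configMap volume. Adding a further dashboard would repeat that pattern, for example (the name my-dashboard is hypothetical and not part of this commit):

# under the grafana container's volumeMounts
- mountPath: /grafana-dashboard-definitions/0/my-dashboard
  name: grafana-dashboard-my-dashboard
  readOnly: false
# under spec.template.spec.volumes
- configMap:
    name: grafana-dashboard-my-dashboard
  name: grafana-dashboard-my-dashboard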


@ -0,0 +1,31 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 3000
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,42 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
prometheus: k8s
role: alert-rules
name: grafana-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: GrafanaAlerts
rules:
- alert: GrafanaRequestsFailing
annotations:
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler
}} is experiencing {{ $value | humanize }}% errors'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
expr: '100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query",
status_code=~"5.."})
/
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
> 50
'
for: 5m
labels:
severity: warning
- name: grafana_rules
rules:
- expr: 'sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
'
record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m


@ -0,0 +1,21 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
ports:
- name: http
port: 3000
targetPort: http
selector:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,19 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 9.5.3
name: grafana
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- interval: 15s
port: http
selector:
matchLabels:
app.kubernetes.io/name: grafana


@ -0,0 +1,117 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
role: alert-rules
name: kube-prometheus-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{
$labels.service }} targets in {{ $labels.namespace }} namespace are
down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
BY (cluster, job, namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting
pipeline is functional.
This alert is always firing, therefore it should always be firing in
Alertmanager
and always fire against a receiver. There are integrations with various
notification
mechanisms that send a notification when this alert is not firing. For
example the
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they
are relevant when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops
firing when another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit
alerts with severity="info".
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname
!= "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"}
== 1
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" changing its up
status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod
}}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m])
> 2
'
for: 2m
labels:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY
(instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0


@ -0,0 +1,131 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- ''
resources:
- configmaps
- secrets
- nodes
- pods
- services
- serviceaccounts
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs:
- list
- watch
- apiGroups:
- apps
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs:
- list
- watch
- apiGroups:
- batch
resources:
- cronjobs
- jobs
verbs:
- list
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- list
- watch
- apiGroups:
- certificates.k8s.io
resources:
- certificatesigningrequests
verbs:
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- apiGroups:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
- ingressclasses
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
- clusterroles
- rolebindings
- roles
verbs:
- list
- watch


@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring


@ -0,0 +1,108 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: kube-state-metrics
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
spec:
automountServiceAccountToken: true
containers:
- args:
- --host=127.0.0.1
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.9.2
name: kube-state-metrics
resources:
limits:
cpu: 100m
memory: 250Mi
requests:
cpu: 10m
memory: 190Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 65534
seccompProfile:
type: RuntimeDefault
- args:
- --secure-listen-address=:8443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8081/
image: quay.io/brancz/kube-rbac-proxy:v0.14.2
name: kube-rbac-proxy-main
ports:
- containerPort: 8443
name: https-main
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
- args:
- --secure-listen-address=:9443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8082/
image: quay.io/brancz/kube-rbac-proxy:v0.14.2
name: kube-rbac-proxy-self
ports:
- containerPort: 9443
name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: kube-state-metrics


@ -0,0 +1,33 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 8443
protocol: TCP
- port: 9443
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,70 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
prometheus: k8s
role: alert-rules
name: kube-state-metrics-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated
rate in list operations. This is likely causing it to not be able to
expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"\
,result=\"error\"}[5m])) by (cluster)\n /\nsum(rate(kube_state_metrics_list_total{job=\"\
kube-state-metrics\"}[5m])) by (cluster))\n> 0.01\n"
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated
rate in watch operations. This is likely causing it to not be able to
expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"\
,result=\"error\"}[5m])) by (cluster)\n /\nsum(rate(kube_state_metrics_watch_total{job=\"\
kube-state-metrics\"}[5m])) by (cluster))\n> 0.01\n"
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards
configuration, some Kubernetes objects may be exposed multiple times
or not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: 'stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"})
by (cluster) != 0
'
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects
are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: "2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"\
}) by (cluster) - 1\n -\nsum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"\
kube-state-metrics\"}) ) by (cluster)\n!= 0\n"
for: 15m
labels:
severity: critical
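
As with the Alertmanager rules, the escaped `expr` strings here unescape to plain PromQL. KubeStateMetricsShardsMissing, for example, is a bitmask check: the sum of 2^shard_ordinal over the shards that report in only equals 2^total_shards - 1 when every ordinal is present. Shown with a block scalar purely for readability:

expr: |
  2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
    -
  sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
  != 0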


@ -0,0 +1,25 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
clusterIP: None
ports:
- name: https-main
port: 8443
targetPort: https-main
- name: https-self
port: 9443
targetPort: https-self
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,42 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.9.2
name: kube-state-metrics
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kube_endpoint_address_not_ready|kube_endpoint_address_available
sourceLabels:
- __name__
port: https-main
relabelings:
- action: labeldrop
regex: (pod|service|endpoint|namespace)
scheme: https
scrapeTimeout: 30s
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
port: https-self
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

File diff suppressed because it is too large

@ -0,0 +1,77 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: apiserver
app.kubernetes.io/part-of: kube-prometheus
name: kube-apiserver
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
      regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|server).*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_controller_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
jobLabel: component
namespaceSelector:
matchNames:
- default
selector:
matchLabels:
component: apiserver
provider: kubernetes


@ -0,0 +1,27 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: coredns
app.kubernetes.io/part-of: kube-prometheus
name: coredns
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
metricRelabelings:
- action: drop
regex: coredns_cache_misses_total
sourceLabels:
- __name__
port: metrics
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-dns


@ -0,0 +1,62 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: kube-controller-manager
app.kubernetes.io/part-of: kube-prometheus
name: kube-controller-manager
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
      regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
sourceLabels:
- __name__
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kube-controller-manager


@ -0,0 +1,25 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: kube-scheduler
app.kubernetes.io/part-of: kube-prometheus
name: kube-scheduler
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kube-scheduler


@ -0,0 +1,107 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: kubelet
app.kubernetes.io/part-of: kube-prometheus
name: kubelet
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
      regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
sourceLabels:
- __name__
port: https-metrics
relabelings:
- action: replace
sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
metricRelabelings:
- action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
sourceLabels:
- __name__
- action: drop
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
sourceLabels:
- __name__
- pod
- namespace
- action: drop
regex: (container_blkio_device_usage_total);.+
sourceLabels:
- __name__
- container
path: /metrics/cadvisor
port: https-metrics
relabelings:
- action: replace
sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/probes
port: https-metrics
relabelings:
- action: replace
sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kubelet


@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
annotations:
argocd.argoproj.io/sync-wave: '0'


@ -0,0 +1,24 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create


@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-exporter
subjects:
- kind: ServiceAccount
name: node-exporter
namespace: monitoring


@ -0,0 +1,120 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: node-exporter
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
spec:
automountServiceAccountToken: true
containers:
- args:
- --web.listen-address=127.0.0.1:9100
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --path.udev.data=/host/root/run/udev/data
- --no-collector.wifi
- --no-collector.hwmon
- --no-collector.btrfs
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/k3s/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
- --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
image: quay.io/prometheus/node-exporter:v1.6.1
name: node-exporter
resources:
limits:
cpu: 250m
memory: 180Mi
requests:
cpu: 102m
memory: 180Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- SYS_TIME
drop:
- ALL
readOnlyRootFilesystem: true
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root
readOnly: true
- args:
- --secure-listen-address=[$(IP)]:9100
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:9100/
env:
- name: IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: quay.io/brancz/kube-rbac-proxy:v0.14.2
name: kube-rbac-proxy
ports:
- containerPort: 9100
hostPort: 9100
name: https
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
hostNetwork: true
hostPID: true
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /
name: root
updateStrategy:
rollingUpdate:
maxUnavailable: 10%
type: RollingUpdate


@ -0,0 +1,31 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 9100
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,429 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
prometheus: k8s
role: alert-rules
name: node-exporter-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24
hours.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\nand\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n)\n"
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4
hours.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 10\nand\n predict_linear(node_filesystem_avail_bytes{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\nand\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n)\n"
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0\n)\n"
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0\n)\n"
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next
24 hours.
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
,mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\nand\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n)\n"
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next
4 hours.
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
,mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\nand\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n)\n"
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
,mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0\n)\n"
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\
,mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\
\",mountpoint!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0\n)\n"
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has
encountered {{ printf "%.0f" $value }} receive errors in the last two
minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: 'rate(node_network_receive_errs_total{job="node-exporter"}[2m]) /
rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
'
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has
encountered {{ printf "%.0f" $value }} transmit errors in the last two
minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: 'rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) /
rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
'
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are
used.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
        summary: Number of conntrack entries is getting close to the limit.
expr: '(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
> 0.75
'
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector on {{ $labels.instance
}} failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: 'node_textfile_scrape_error{job="node-exporter"} == 1
'
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than
0.05s. Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: "(\n node_timex_offset_seconds{job=\"node-exporter\"} > 0.05\nand\n\
\ deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) >= 0\n\
)\nor\n(\n node_timex_offset_seconds{job=\"node-exporter\"} < -0.05\n\
and\n deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) <=\
\ 0\n)\n"
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: 'min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) ==
0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
'
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' at {{ $labels.instance
          }} is in a degraded state due to one or more disk failures. The number
          of spare drives is insufficient to fix the issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded.
expr: 'node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
> 0
'
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array at {{ $labels.instance
}} failed. Array '{{ $labels.device }}' needs attention and possibly
a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array.
expr: 'node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
> 0
'
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\
node-exporter\"} > 70\n)\n"
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\
node-exporter\"} > 90\n)\n"
for: 15m
labels:
severity: critical
- alert: NodeCPUHighUsage
annotations:
description: 'CPU usage at {{ $labels.instance }} has been above 90% for
the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: 'sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
mode!="idle"}[2m]))) * 100 > 90
'
for: 15m
labels:
severity: info
- alert: NodeSystemSaturation
annotations:
description: 'System load per core at {{ $labels.instance }} has been
above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value
}}.
          This might indicate resource saturation on this instance and can
          cause it to become unresponsive.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
summary: System saturated, load per core is very high.
expr: 'node_load1{job="node-exporter"}
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter",
mode="idle"}) > 2
'
for: 15m
labels:
severity: warning
- alert: NodeMemoryMajorPagesFaults
annotations:
        description: 'Memory major page faults are occurring at very high rate at {{
$labels.instance }}, 500 major page faults per second for the last 15
minutes, is currently at {{ printf "%.2f" $value }}.
Please check that there is enough memory available at this instance.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
summary: Memory major page faults are occurring at very high rate.
expr: 'rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
'
for: 15m
labels:
severity: warning
- alert: NodeMemoryHighUtilization
annotations:
description: 'Memory is filling up at {{ $labels.instance }}, has been
above 90% for the last 15 minutes, is currently at {{ printf "%.2f"
$value }}%.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Host is running out of memory.
expr: '100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
* 100) > 90
'
for: 15m
labels:
severity: warning
- alert: NodeDiskIOSaturation
annotations:
        description: 'Disk IO queue (aqu-sz) is high on {{ $labels.device }} at
{{ $labels.instance }}, has been above 10 for the last 15 minutes, is
currently at {{ printf "%.2f" $value }}.
This symptom might indicate disk saturation.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
summary: Disk IO queue is high.
expr: 'rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
> 10
'
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description: Systemd service {{ $labels.name }} has entered failed state
          at {{ $labels.instance }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
expr: 'node_systemd_unit_state{job="node-exporter", state="failed"} == 1
'
for: 5m
labels:
severity: warning
- alert: NodeBondingDegraded
annotations:
description: Bonding interface {{ $labels.master }} on {{ $labels.instance
}} is in degraded state due to one or more slave failures.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
        summary: Bonding interface is degraded.
expr: '(node_bonding_slaves - node_bonding_active) != 0
'
for: 5m
labels:
severity: warning
- name: node-exporter.rules
rules:
- expr: "count without (cpu, mode) (\n node_cpu_seconds_total{job=\"node-exporter\"\
,mode=\"idle\"}\n)\n"
record: instance:node_num_cpu:sum
- expr: "1 - avg without (cpu) (\n sum without (mode) (rate(node_cpu_seconds_total{job=\"\
node-exporter\", mode=~\"idle|iowait|steal\"}[5m]))\n)\n"
record: instance:node_cpu_utilisation:rate5m
- expr: "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"\
node-exporter\"}\n)\n"
record: instance:node_load1_per_cpu:ratio
- expr: "1 - (\n (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"\
}\n or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"\
}\n +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n \
\ +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n \
\ +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n )\n\
/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n"
record: instance:node_memory_utilisation:ratio
- expr: 'rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
'
record: instance:node_vmstat_pgmajfault:rate5m
- expr: 'rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
'
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: 'rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
'
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n)\n"
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n)\n"
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: "sum without (device) (\n rate(node_network_receive_drop_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n)\n"
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n)\n"
record: instance:node_network_transmit_drop_excluding_lo:rate5m


@ -0,0 +1,22 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
clusterIP: None
ports:
- name: https
port: 9100
targetPort: https
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'


@ -0,0 +1,33 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.6.1
name: node-exporter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: https
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,23 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- ''
resources:
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get


@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring


@ -0,0 +1,49 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 9090
protocol: TCP
- port: 8080
protocol: TCP
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus-adapter
ports:
- port: 9090
protocol: TCP
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: grafana
ports:
- port: 9090
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress


@ -0,0 +1,21 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
minAvailable: 1
selector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus


@ -0,0 +1,50 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
alerting:
alertmanagers:
- apiVersion: v2
name: alertmanager-main
namespace: monitoring
port: web
enableFeatures: []
externalLabels: {}
image: quay.io/prometheus/prometheus:v2.46.0
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
replicas: 2
resources:
requests:
memory: 400Mi
ruleNamespaceSelector: {}
ruleSelector: {}
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: prometheus-k8s
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
version: 2.46.0


@ -0,0 +1,348 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
prometheus: k8s
role: alert-rules
name: prometheus-k8s-prometheus-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed
to reload its configuration.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: '# Without max_over_time, failed scrapes could create false negatives,
see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0
for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m])
== 0
'
for: 10m
labels:
severity: critical
- alert: PrometheusSDRefreshFailure
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed
to refresh SD with mechanism {{$labels.mechanism}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure
summary: Failed Prometheus SD refresh.
expr: 'increase(prometheus_sd_refresh_failures_total{job="prometheus-k8s",namespace="monitoring"}[10m])
> 0
'
for: 20m
labels:
severity: warning
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in
less than 30m.
expr: "# Without min_over_time, failed scrapes could create false negatives,\
\ see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
expr: "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"\
,namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n> 1\n"
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
to any Alertmanagers.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: '# Without max_over_time, failed scrapes could create false negatives,
see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0
for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m])
< 1
'
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} reload failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: 'increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h])
> 0
'
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} compaction failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: 'increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h])
> 0
'
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
samples.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"\
,namespace=\"monitoring\"}[5m]) <= 0\nand\n (\n sum without(scrape_job)\
\ (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"\
monitoring\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}) > 0\n )\n)\n"
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with different values but duplicated
timestamp.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: 'rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with timestamps arriving out of
order.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: 'rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to
send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
          $labels.url }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n> 1\n"
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{
$labels.url }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: "# Without max_over_time, failed scrapes could create false negatives,\
\ see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name,\
\ url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n> 120\n"
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ $value }} shards for queue
{{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max
of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
$labels.instance | query | first | value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run
more than configured max shards.
expr: "# Without max_over_time, failed scrapes could create false negatives,\
\ see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n>\n max_over_time(prometheus_remote_storage_shards_max{job=\"\
prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n"
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed
to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: 'increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed
{{ printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group
evaluation.
expr: 'increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
{{ printf "%.0f" $value }} targets because the number of targets exceeded
the configured target_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have
exceeded the targets limit.
expr: 'increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
{{ printf "%.0f" $value }} targets because some samples exceeded the
configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have
exceeded the labels limit.
expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed
{{ printf "%.0f" $value }} scrapes in the last 5m because some targets
exceeded the configured body_size_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: 'increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeSampleLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed
{{ printf "%.0f" $value }} scrapes in the last 5m because some targets
exceeded the configured sample_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
summary: Prometheus has failed scrapes that have exceeded the configured
sample limit.
expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0
'
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}}
have failed to sync because invalid configuration was supplied.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: 'increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m])
> 0
'
for: 5m
labels:
severity: critical
- alert: PrometheusHighQueryLoad
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API
has less than 20% available capacity in its query engine for the last
15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
summary: Prometheus is reaching its maximum capacity serving concurrent
requests.
expr: 'avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring"}[5m])
/ max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring"}[5m])
> 0.8
'
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending
alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any
Alertmanager.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any
Alertmanager.
expr: "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"\
prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n\
\ rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"\
monitoring\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n"
for: 15m
labels:
severity: critical
@ -0,0 +1,21 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s-config
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s-config
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
@ -0,0 +1,63 @@
apiVersion: rbac.authorization.k8s.io/v1
items:
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
kind: RoleBindingList
metadata:
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s-config
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- ''
resources:
- configmaps
verbs:
- get
@ -0,0 +1,120 @@
apiVersion: rbac.authorization.k8s.io/v1
items:
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: default
rules:
- apiGroups:
- ''
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: kube-system
rules:
- apiGroups:
- ''
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
rules:
- apiGroups:
- ''
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
kind: RoleList
metadata:
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,27 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
ports:
- name: web
port: 9090
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP
@ -0,0 +1,14 @@
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,25 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.46.0
name: prometheus-k8s
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- interval: 30s
port: web
- interval: 30s
port: reloader-web
selector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
@ -0,0 +1,20 @@
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: v1beta1.metrics.k8s.io
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: prometheus-adapter
namespace: monitoring
version: v1beta1
versionPriority: 100
@ -0,0 +1,23 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- ''
resources:
- nodes
- namespaces
- pods
- services
verbs:
- get
- list
- watch
@ -0,0 +1,24 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
rbac.authorization.k8s.io/aggregate-to-admin: 'true'
rbac.authorization.k8s.io/aggregate-to-edit: 'true'
rbac.authorization.k8s.io/aggregate-to-view: 'true'
name: system:aggregated-metrics-reader
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list
- watch
@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-adapter
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: resource-metrics:system:auth-delegator
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
@ -0,0 +1,18 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: resource-metrics-server-resources
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- metrics.k8s.io
resources:
- '*'
verbs:
- '*'
@ -0,0 +1,35 @@
apiVersion: v1
data:
config.yaml: "\"resourceRules\":\n \"cpu\":\n \"containerLabel\": \"container\"\
\n \"containerQuery\": |\n sum by (<<.GroupBy>>) (\n irate (\n\
\ container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"\
\",pod!=\"\"}[120s]\n )\n )\n \"nodeQuery\": |\n sum by (<<.GroupBy>>)\
\ (\n 1 - irate(\n node_cpu_seconds_total{mode=\"idle\"}[60s]\n\
\ )\n * on(namespace, pod) group_left(node) (\n node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}\n\
\ )\n )\n or sum by (<<.GroupBy>>) (\n 1 - irate(\n \
\ windows_cpu_time_total{mode=\"idle\", job=\"windows-exporter\",<<.LabelMatchers>>}[4m]\n\
\ )\n )\n \"resources\":\n \"overrides\":\n \"namespace\"\
:\n \"resource\": \"namespace\"\n \"node\":\n \"resource\"\
: \"node\"\n \"pod\":\n \"resource\": \"pod\"\n \"memory\":\n\
\ \"containerLabel\": \"container\"\n \"containerQuery\": |\n sum by\
\ (<<.GroupBy>>) (\n container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"\
\",pod!=\"\"}\n )\n \"nodeQuery\": |\n sum by (<<.GroupBy>>) (\n\
\ node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>}\n\
\ -\n node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}\n\
\ )\n or sum by (<<.GroupBy>>) (\n windows_cs_physical_memory_bytes{job=\"\
windows-exporter\",<<.LabelMatchers>>}\n -\n windows_memory_available_bytes{job=\"\
windows-exporter\",<<.LabelMatchers>>}\n )\n \"resources\":\n \"\
overrides\":\n \"instance\":\n \"resource\": \"node\"\n \
\ \"namespace\":\n \"resource\": \"namespace\"\n \"pod\":\n \
\ \"resource\": \"pod\"\n \"window\": \"5m\""
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: adapter-config
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,102 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
template:
metadata:
annotations:
checksum.config/md5: 3b1ebf7df0232d1675896f67b66373db
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
spec:
automountServiceAccountToken: true
containers:
- args:
- --cert-dir=/var/run/serving-cert
- --config=/etc/adapter/config.yaml
- --metrics-relist-interval=1m
- --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
- --secure-port=6443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
image: registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.11.1
livenessProbe:
failureThreshold: 5
httpGet:
path: /livez
port: https
scheme: HTTPS
periodSeconds: 5
name: prometheus-adapter
ports:
- containerPort: 6443
name: https
readinessProbe:
failureThreshold: 5
httpGet:
path: /readyz
port: https
scheme: HTTPS
periodSeconds: 5
resources:
limits:
cpu: 250m
memory: 180Mi
requests:
cpu: 102m
memory: 180Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
startupProbe:
failureThreshold: 18
httpGet:
path: /livez
port: https
scheme: HTTPS
periodSeconds: 10
volumeMounts:
- mountPath: /tmp
name: tmpfs
readOnly: false
- mountPath: /var/run/serving-cert
name: volume-serving-cert
readOnly: false
- mountPath: /etc/adapter
name: config
readOnly: false
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: prometheus-adapter
volumes:
- emptyDir: {}
name: tmpfs
- emptyDir: {}
name: volume-serving-cert
- configMap:
name: adapter-config
name: config
@ -0,0 +1,25 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- {}
podSelector:
matchLabels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress
@ -0,0 +1,19 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
minAvailable: 1
selector:
matchLabels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: resource-metrics-auth-reader
namespace: kube-system
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
@ -0,0 +1,21 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
ports:
- name: https
port: 443
targetPort: 6443
selector:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,30 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.11.1
name: prometheus-adapter
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*)
sourceLabels:
- __name__
port: https
scheme: https
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
@ -0,0 +1,101 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
annotations:
argocd.argoproj.io/sync-wave: '1'
rules:
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/finalizers
- alertmanagers/status
- alertmanagerconfigs
- prometheuses
- prometheuses/finalizers
- prometheuses/status
- prometheusagents
- prometheusagents/finalizers
- prometheusagents/status
- thanosrulers
- thanosrulers/finalizers
- thanosrulers/status
- scrapeconfigs
- servicemonitors
- podmonitors
- probes
- prometheusrules
verbs:
- '*'
- apiGroups:
- apps
resources:
- statefulsets
verbs:
- '*'
- apiGroups:
- ''
resources:
- configmaps
- secrets
verbs:
- '*'
- apiGroups:
- ''
resources:
- pods
verbs:
- list
- delete
- apiGroups:
- ''
resources:
- services
- services/finalizers
- endpoints
verbs:
- get
- create
- update
- delete
- apiGroups:
- ''
resources:
- nodes
verbs:
- list
- watch
- apiGroups:
- ''
resources:
- namespaces
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
@ -0,0 +1,19 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
annotations:
argocd.argoproj.io/sync-wave: '1'
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-operator
subjects:
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring
@ -0,0 +1,85 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: prometheus-operator
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
spec:
automountServiceAccountToken: true
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.67.1
image: quay.io/prometheus-operator/prometheus-operator:v0.67.1
name: prometheus-operator
ports:
- containerPort: 8080
name: http
resources:
limits:
cpu: 200m
memory: 200Mi
requests:
cpu: 100m
memory: 100Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
- args:
- --secure-listen-address=:8443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8080/
image: quay.io/brancz/kube-rbac-proxy:v0.14.2
name: kube-rbac-proxy
ports:
- containerPort: 8443
name: https
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534
seccompProfile:
type: RuntimeDefault
serviceAccountName: prometheus-operator

apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 8443
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress
@ -0,0 +1,132 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
prometheus: k8s
role: alert-rules
name: prometheus-operator-rules
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
groups:
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: '(sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m]))
/ sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m])))
> 0.4
'
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: '(sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m]))
/ sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m])))
> 0.4
'
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace
}} namespace fails to reconcile {{ $value }} objects.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: 'min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m])
> 0
'
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace
}} namespace.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: '(sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m])))
/ (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m])))
> 0.1
'
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace
}} Namespace.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: 'rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m])
> 0.1
'
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace
isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
expr: 'min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m])
== 0)
'
for: 5m
labels:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace
rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource
}} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: 'min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m])
> 0
'
for: 5m
labels:
severity: warning
- name: config-reloaders
rules:
- alert: ConfigReloaderSidecarErrors
annotations:
description: 'Errors encountered while the {{$labels.pod}} config-reloader
sidecar attempts to sync config in {{$labels.namespace}} namespace.
As a result, configuration for service running in {{$labels.pod}} may
be stale and cannot be updated anymore.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: 'max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m])
== 0
'
for: 10m
labels:
severity: warning
@ -0,0 +1,22 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
clusterIP: None
ports:
- name: https
port: 8443
targetPort: https
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
@ -0,0 +1,13 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
@ -0,0 +1,26 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
name: prometheus-operator
namespace: monitoring
annotations:
argocd.argoproj.io/sync-wave: '1'
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
port: https
scheme: https
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.67.1
@ -0,0 +1,17 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: kube-prometheus
namespace: argocd
spec:
destination:
namespace: monitoring
server: https://kubernetes.default.svc
project: infra
source:
path: apps-kustomized/kube-prometheus
repoURL: https://git.martyn.berlin/martyn/infra4talos
targetRevision: HEAD
syncPolicy:
automated:
selfHeal: true
vendor-stuff.sh Executable file
@ -0,0 +1,39 @@
#!/bin/bash
# the directory of the script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# the temp directory used, within $DIR
# omit the -p parameter to create a temporary directory in the default location
WORK_DIR=$(mktemp -d -p "$DIR")
# check if tmp dir was created
if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
  echo "Could not create temp dir"
  exit 1
fi
# deletes the temp directory
function cleanup {
  rm -rf "$WORK_DIR"
  echo "Deleted temp working directory $WORK_DIR"
}
# register the cleanup function to be called on the EXIT signal
trap cleanup EXIT
# implementation of script starts here
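# kube-prometheus release branch to vendor manifests from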
KUBE_PROM_VERSION=release-0.13
cd "$WORK_DIR"
git clone https://github.com/prometheus-operator/kube-prometheus.git
cd kube-prometheus
git checkout "$KUBE_PROM_VERSION"
rm manifests/alertmanager-secret.yaml
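# setup manifests (the CRDs) are annotated with sync-wave 0 so Argo CD applies
# them before the rest of the manifests, which get sync-wave 1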
for y in manifests/setup/*.yaml; do
  yq -y '.metadata.annotations += {"argocd.argoproj.io/sync-wave":"0"}' "$y" > "$DIR/apps-kustomized/kube-prometheus/$(basename "$y")"
done
for y in manifests/*.yaml; do
  yq -y '.metadata.annotations += {"argocd.argoproj.io/sync-wave":"1"}' "$y" > "$DIR/apps-kustomized/kube-prometheus/$(basename "$y")"
done