1.Установка prometheus
2.exporter nginx(ingress-controller)
3.exporter elasticsearch
4.exporter rabbitmq
5.exporter redis
6.настройка оповещений в telegram
6.1 настройка оповещений в telegram в различные чаты(группы)
6.2. настройка оповещений в telegram разграничение оповещений по группам (исключения уведомлений)
7.Проблема с prometheus-kube-proxy
8.Настройка алерта для определённого неймспейса
9.Добавление оповещений и по email
10. Настройка графиков в grafana
git clone https://github.com/prometheus-community/helm-charts.git
cd helm-charts/charts/kube-prometheus-stack/
докачиваем чарты:
helm dep update
создаём namespace, в котором будет всё крутиться:
kubectl create ns monitoring
теперь рассмотрим что правим в переменных у helm chart:
[root@prod-vsrv-kubemaster1 charts]# vim kube-prometheus-stack/values.yaml
namespaceOverride: "monitoring"
для работы telegram бота:
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
##
config:
global:
resolve_timeout: 5m
route:
receiver: 'telegram'
routes:
- match:
severity: critical
repeat_interval: 48h
continue: true
receiver: 'telegram'
- match:
alertname: Watchdog
repeat_interval: 48h
continue: true
receiver: 'telegram'
receivers:
- name: 'telegram'
webhook_configs:
- send_resolved: true
url: 'http://alertmanager-bot:8080'
templates:
- '/etc/alertmanager/config/*.tmpl'
настраиваем ingress у alertmanager:
ingress:
enabled: true
hosts:
- alertmanager.prod.test.local
paths:
- /
настраиваем volume для Alertmanager отмечу что в кластере настроен nfs-provisioner — nfs-storageclass
storage:
volumeClaimTemplate:
spec:
storageClassName: nfs-storageclass
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 10Gi
теперь настроим grafana
тут указываем ingress а также добавляем хранение dashboard в nfs storage-class
grafana:
enabled: true
namespaceOverride: "monitoring"
## Deploy default dashboards.
##
defaultDashboardsEnabled: true
adminPassword: prom-operator
ingress:
## If true, Grafana Ingress will be created
##
enabled: true
labels: {}
## Hostnames.
## Must be provided if Ingress is enable.
##
hosts:
- grafana.prod.test.local
#hosts: []
## Path for grafana ingress
path: /
## TLS configuration for grafana Ingress
## Secret must be manually created in the namespace
##
tls: []
# - secretName: grafana-general-tls
# hosts:
# - grafana.example.com
persistence:
type: pvc
enabled: true
storageClassName: nfs-storageclass
accessModes:
- ReadWriteMany
size: 5Gi
# annotations: {}
finalizers:
- kubernetes.io/pvc-protection
## If using kubeControllerManager.endpoints only the port and targetPort are used
##
service:
port: 10252
targetPort: 10252
selector:
k8s-app: kube-controller-manager
# component: kube-controller-manager
## If using kubeScheduler.endpoints only the port and targetPort are used
##
service:
port: 10251
targetPort: 10251
selector:
k8s-app: kube-scheduler
# component: kube-scheduler
## Configuration for kube-state-metrics subchart
##
kube-state-metrics:
namespaceOverride: "monitoring"
rbac:
create: true
podSecurityPolicy:
enabled: true
## Configuration for prometheus-node-exporter subchart
##
prometheus-node-exporter:
namespaceOverride: "monitoring"
теперь настраиваем ingress для prometheus
ingress:
enabled: true
annotations: {}
labels: {}
## Hostnames.
## Must be provided if Ingress is enabled.
##
hosts:
- prometheus.prod.test.local
## Paths to use for ingress rules -
##
paths:
- /
а так же volume:
## Prometheus StorageSpec for persistent data
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/storage.md
##
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: nfs-storageclass
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 10Gi
и теперь важная фишка, добавление label который надо будет добавить на все неймспейсы:
## Namespaces to be selected for ServiceMonitor discovery.
##
serviceMonitorNamespaceSelector:
matchLabels:
prometheus: enabled
## Log level for Alertmanager to be configured with.
##
logLevel: info
## Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the
## running cluster equal to the expected size.
replicas: 3
также правим:
## Enable scraping /metrics/resource from kubelet's service
## This is disabled by default because container metrics are already exposed by cAdvisor
##
resource: true
для выставления срока хранения данных можем поменять следующее значение:
## Time duration Alertmanager shall retain data for. Default is '120h', and must match the regular expression
## [0-9]+(ms|s|m|h) (milliseconds seconds minutes hours).
##
retention: 120h
запускаем теперь helm chart
[root@prod-vsrv-kubemaster1 charts]# helm upgrade --install prometheus kube-prometheus-stack/ -f kube-prometheus-stack/values.yaml --namespace monitoring
Release "prometheus" does not exist. Installing it now.
NAME: prometheus
LAST DEPLOYED: Thu Mar 4 13:25:07 2021
NAMESPACE: monitoring
STATUS: deployed
REVISION: 1
NOTES:
kube-prometheus-stack has been installed. Check its status by running:
kubectl --namespace monitoring get pods -l "release=prometheus"
Visit https://github.com/prometheus-operator/kube-prometheus for instructions on how to create & configure Alertmanager and Prometheus instances using the Operator.
видим что при запуске добавился label release=prometheus — проверяем:
kubectl describe pod prometheus-kube-prometheus-operator-659d5f8674-qxrf5 -n monitoring | grep -i release
release=prometheus
смотрим label на всех неймспейсах:
kubectl get ns --show-labels
NAME STATUS AGE LABELS
default Active 192d <none>
elk Active 63d <none>
ingress-nginx Active 192d name=ingress-nginx
keda Active 86d <none>
kube-node-lease Active 192d <none>
kube-public Active 192d <none>
kube-system Active 192d name=kube-system
m-logstash-megabuilder Active 12d <none>
monitoring Active 3h15m <none>
terminal-soft Active 176d <none>
проставим на них label prometheus=enabled
kubectl label namespace --all "prometheus=enabled"
проверяем:
kubectl get ns --show-labels
NAME STATUS AGE LABELS
default Active 192d prometheus=enabled
elk Active 63d prometheus=enabled
ingress-nginx Active 192d name=ingress-nginx,prometheus=enabled
keda Active 86d prometheus=enabled
kube-node-lease Active 192d prometheus=enabled
kube-public Active 192d prometheus=enabled
kube-system Active 192d name=kube-system,prometheus=enabled
m-logstash-megabuilder Active 12d prometheus=enabled
monitoring Active 3h16m prometheus=enabled
terminal-soft Active 176d prometheus=enabled
теперь настроим сбор метрик с ingress controller,
создаём сервис для ingress. Указываем namespace в котором работает ingress, так же необходим label app.kubernetes.io/name: ingress-nginx данный лейбл смотрим так:
kubectl describe pod -n ingress-nginx ingress-nginx-controller-vqjkl | grep -A3 Labels
Labels: app.kubernetes.io/name=ingress-nginx
app.kubernetes.io/part-of=ingress-nginx
controller-revision-hash=bd6d56f49
pod-template-generation=1
mkdir exporter-ingres
cat exporter-ingres/service.yaml
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/name: ingress-nginx
release: prometheus
name: ingress-nginx
namespace: ingress-nginx
spec:
ports:
- name: http
port: 80
protocol: TCP
targetPort: 80
- name: https
port: 443
protocol: TCP
targetPort: 443
- name: prometheus
port: 10254
protocol: TCP
targetPort: 10254
selector:
app.kubernetes.io/name: ingress-nginx
В данном файле так же обращаем внимание на:
name: prometheus
на это имя будет натравлен port у ServiceMonitor
теперь создаём ServiceMonitor, он будет создавать в prometheus target с метриками ingress controller:
cat exporter-ingres/service-monitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app: ingress-nginx
release: prometheus
name: ingress-nginx
namespace: monitoring
spec:
endpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: prometheus
scheme: http
scrapeTimeout: 10s
namespaceSelector:
any: true
selector:
matchLabels:
app.kubernetes.io/name: ingress-nginx
release: prometheus
также правим:
## Enable scraping /metrics/resource from kubelet's service
## This is disabled by default because container metrics are already exposed by cAdvisor
##
resource: true
применяем:
[root@prod-vsrv-kubemaster1 charts]# kubectl apply -f exporter-ingres/service.yaml -f exporter-ingres/service-monitor.yaml
через пару минуток проверяем в prometheus
общий вид у файла values.yaml будет следующий:
cat helm-charts/charts/kube-prometheus-stack/values.yaml
# Default values for kube-prometheus-stack.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
## Provide a name in place of kube-prometheus-stack for `app:` labels
##
nameOverride: ""
## Override the deployment namespace
##
namespaceOverride: "monitoring"
## Provide a k8s version to auto dashboard import script example: kubeTargetVersionOverride: 1.16.6
##
kubeTargetVersionOverride: ""
## Provide a name to substitute for the full names of resources
##
fullnameOverride: ""
## Labels to apply to all resources
##
commonLabels: {}
# scmhash: abc123
# myLabel: aakkmd
## Create default rules for monitoring the cluster
##
defaultRules:
create: true
rules:
alertmanager: true
etcd: true
general: true
k8s: true
kubeApiserver: true
kubeApiserverAvailability: true
kubeApiserverError: true
kubeApiserverSlos: true
kubelet: true
kubePrometheusGeneral: true
kubePrometheusNodeAlerting: true
kubePrometheusNodeRecording: true
kubernetesAbsent: true
kubernetesApps: true
kubernetesResources: true
kubernetesStorage: true
kubernetesSystem: true
kubeScheduler: true
kubeStateMetrics: true
network: true
node: true
prometheus: true
prometheusOperator: true
time: true
## Runbook url prefix for default rules
runbookUrl: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#
## Reduce app namespace alert scope
appNamespacesTarget: ".*"
## Labels for default rules
labels: {}
## Annotations for default rules
annotations: {}
## Additional labels for PrometheusRule alerts
additionalRuleLabels: {}
## Deprecated way to provide custom recording or alerting rules to be deployed into the cluster.
##
# additionalPrometheusRules: []
# - name: my-rule-file
# groups:
# - name: my_group
# rules:
# - record: my_record
# expr: 100 * my_record
## Provide custom recording or alerting rules to be deployed into the cluster.
##
additionalPrometheusRulesMap: {}
# rule-name:
# groups:
# - name: my_group
# rules:
# - record: my_record
# expr: 100 * my_record
##
global:
rbac:
create: true
pspEnabled: true
pspAnnotations: {}
## Specify pod annotations
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp
## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl
##
# seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
# seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
# apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
## Reference to one or more secrets to be used when pulling images
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
##
imagePullSecrets: []
# - name: "image-pull-secret"
## Configuration for alertmanager
## ref: https://prometheus.io/docs/alerting/alertmanager/
##
alertmanager:
## Deploy alertmanager
##
enabled: true
## Api that prometheus will use to communicate with alertmanager. Possible values are v1, v2
##
apiVersion: v2
## Service account for Alertmanager to use.
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/
##
serviceAccount:
create: true
name: ""
annotations: {}
## Configure pod disruption budgets for Alertmanager
## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/#specifying-a-poddisruptionbudget
## This configuration is immutable once created and will require the PDB to be deleted to be changed
## https://github.com/kubernetes/kubernetes/issues/45398
##
podDisruptionBudget:
enabled: false
minAvailable: 1
maxUnavailable: ""
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
##
config:
global:
resolve_timeout: 5m
route:
receiver: 'telegram'
routes:
- match:
severity: critical
repeat_interval: 48h
continue: true
receiver: 'telegram'
- match:
alertname: Watchdog
repeat_interval: 48h
continue: true
receiver: 'telegram'
receivers:
- name: 'telegram'
webhook_configs:
- send_resolved: true
url: 'http://alertmanager-bot:8080'
# config:
# global:
# resolve_timeout: 5m
# route:
# group_by: ['job']
# group_wait: 30s
# group_interval: 5m
# repeat_interval: 12h
# receiver: 'null'
# routes:
# - match:
# alertname: Watchdog
# receiver: 'null'
# receivers:
# - name: 'null'
templates:
- '/etc/alertmanager/config/*.tmpl'
## Pass the Alertmanager configuration directives through Helm's templating
## engine. If the Alertmanager configuration contains Alertmanager templates,
## they'll need to be properly escaped so that they are not interpreted by
## Helm
## ref: https://helm.sh/docs/developing_charts/#using-the-tpl-function
## https://prometheus.io/docs/alerting/configuration/#tmpl_string
## https://prometheus.io/docs/alerting/notifications/
## https://prometheus.io/docs/alerting/notification_examples/
tplConfig: false
## Alertmanager template files to format alerts
## By default, templateFiles are placed in /etc/alertmanager/config/ and if
## they have a .tmpl file suffix will be loaded. See config.templates above
## to change, add other suffixes. If adding other suffixes, be sure to update
## config.templates above to include those suffixes.
## ref: https://prometheus.io/docs/alerting/notifications/
## https://prometheus.io/docs/alerting/notification_examples/
##
templateFiles: {}
#
## An example template:
# template_1.tmpl: |-
# {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\.(.*)" "$1" }}{{ end }}
#
# {{ define "slack.myorg.text" }}
# {{- $root := . -}}
# {{ range .Alerts }}
# *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
# *Cluster:* {{ template "cluster" $root }}
# *Description:* {{ .Annotations.description }}
# *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>
# *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
# *Details:*
# {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
# {{ end }}
# {{ end }}
# {{ end }}
ingress:
enabled: true
# For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
annotations: {}
labels: {}
## Hosts must be provided if Ingress is enabled.
##
hosts:
- alertmanager.prod.test.local
## Paths to use for ingress rules - one path should match the alertmanagerSpec.routePrefix
##
paths:
- /
## For Kubernetes >= 1.18 you should specify the pathType (determines how Ingress paths should be matched)
## See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#better-path-matching-with-path-types
# pathType: ImplementationSpecific
## TLS configuration for Alertmanager Ingress
## Secret must be manually created in the namespace
##
tls: []
# - secretName: alertmanager-general-tls
# hosts:
# - alertmanager.example.com
## Configuration for Alertmanager secret
##
secret:
annotations: {}
## Configuration for creating an Ingress that will map to each Alertmanager replica service
## alertmanager.servicePerReplica must be enabled
##
ingressPerReplica:
enabled: false
# For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
annotations: {}
labels: {}
## Final form of the hostname for each per replica ingress is
## {{ ingressPerReplica.hostPrefix }}-{{ $replicaNumber }}.{{ ingressPerReplica.hostDomain }}
##
## Prefix for the per replica ingress that will have `-$replicaNumber`
## appended to the end
hostPrefix: ""
## Domain that will be used for the per replica ingress
hostDomain: ""
## Paths to use for ingress rules
##
paths: []
# - /
## For Kubernetes >= 1.18 you should specify the pathType (determines how Ingress paths should be matched)
## See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#better-path-matching-with-path-types
# pathType: ImplementationSpecific
## Secret name containing the TLS certificate for alertmanager per replica ingress
## Secret must be manually created in the namespace
tlsSecretName: ""
## Separated secret for each per replica Ingress. Can be used together with cert-manager
##
tlsSecretPerReplica:
enabled: false
## Final form of the secret for each per replica ingress is
## {{ tlsSecretPerReplica.prefix }}-{{ $replicaNumber }}
##
prefix: "alertmanager"
## Configuration for Alertmanager service
##
service:
annotations: {}
labels: {}
clusterIP: ""
## Port for Alertmanager Service to listen on
##
port: 9093
## To be used with a proxy extraContainer port
##
targetPort: 9093
## Port to expose on each node
## Only used if service.type is 'NodePort'
##
nodePort: 30903
## List of IP addresses at which the Prometheus server service is available
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
##
## Additional ports to open for Alertmanager service
additionalPorts: []
externalIPs: []
loadBalancerIP: ""
loadBalancerSourceRanges: []
## Service type
##
type: ClusterIP
## Configuration for creating a separate Service for each statefulset Alertmanager replica
##
servicePerReplica:
enabled: false
annotations: {}
## Port for Alertmanager Service per replica to listen on
##
port: 9093
## To be used with a proxy extraContainer port
targetPort: 9093
## Port to expose on each node
## Only used if servicePerReplica.type is 'NodePort'
##
nodePort: 30904
## Loadbalancer source IP ranges
## Only used if servicePerReplica.type is "loadbalancer"
loadBalancerSourceRanges: []
## Service type
##
type: ClusterIP
## If true, create a serviceMonitor for alertmanager
##
serviceMonitor:
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
selfMonitor: true
## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS.
scheme: ""
## tlsConfig: TLS configuration to use when scraping the endpoint. For example if using istio mTLS.
## Of type: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#tlsconfig
tlsConfig: {}
bearerTokenFile:
## metric relabel configs to apply to samples before ingestion.
##
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
##
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace
## Settings affecting alertmanagerSpec
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec
##
alertmanagerSpec:
## Standard object’s metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Metadata Labels and Annotations gets propagated to the Alertmanager pods.
##
podMetadata: {}
## Image of Alertmanager
##
image:
repository: quay.io/prometheus/alertmanager
tag: v0.21.0
sha: ""
## If true then the user will be responsible to provide a secret with alertmanager configuration
## So when true the config part will be ignored (including templateFiles) and the one in the secret will be used
##
useExistingSecret: false
## Secrets is a list of Secrets in the same namespace as the Alertmanager object, which shall be mounted into the
## Alertmanager Pods. The Secrets are mounted into /etc/alertmanager/secrets/.
##
secrets: []
## ConfigMaps is a list of ConfigMaps in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods.
## The ConfigMaps are mounted into /etc/alertmanager/configmaps/.
##
configMaps: []
## ConfigSecret is the name of a Kubernetes Secret in the same namespace as the Alertmanager object, which contains configuration for
## this Alertmanager instance. Defaults to 'alertmanager-' The secret is mounted into /etc/alertmanager/config.
##
# configSecret:
## AlertmanagerConfigs to be selected to merge and configure Alertmanager with.
##
alertmanagerConfigSelector: {}
## Example which selects all alertmanagerConfig resources
## with label "alertconfig" with values any of "example-config" or "example-config-2"
# alertmanagerConfigSelector:
# matchExpressions:
# - key: alertconfig
# operator: In
# values:
# - example-config
# - example-config-2
#
## Example which selects all alertmanagerConfig resources with label "role" set to "example-config"
# alertmanagerConfigSelector:
# matchLabels:
# role: example-config
## Namespaces to be selected for AlertmanagerConfig discovery. If nil, only check own namespace.
##
alertmanagerConfigNamespaceSelector: {}
## Example which selects all namespaces
## with label "alertmanagerconfig" with values any of "example-namespace" or "example-namespace-2"
# alertmanagerConfigNamespaceSelector:
# matchExpressions:
# - key: alertmanagerconfig
# operator: In
# values:
# - example-namespace
# - example-namespace-2
## Example which selects all namespaces with label "alertmanagerconfig" set to "enabled"
# alertmanagerConfigNamespaceSelector:
# matchLabels:
# alertmanagerconfig: enabled
## Define Log Format
# Use logfmt (default) or json logging
logFormat: logfmt
## Log level for Alertmanager to be configured with.
##
logLevel: info
## Size is the expected size of the alertmanager cluster. The controller will eventually make the size of the
## running cluster equal to the expected size.
replicas: 3
## Time duration Alertmanager shall retain data for. Default is '120h', and must match the regular expression
## [0-9]+(ms|s|m|h) (milliseconds seconds minutes hours).
##
retention: 120h
## Storage is the definition of how storage will be used by the Alertmanager instances.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/storage.md
##
storage:
volumeClaimTemplate:
spec:
storageClassName: nfs-storageclass
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 10Gi
# selector: {}
## The external URL the Alertmanager instances will be available under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name. string false
##
externalUrl:
## The route prefix Alertmanager registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true,
## but the server serves requests under a different route prefix. For example for use with kubectl proxy.
##
routePrefix: /
## If set to true all actions on the underlying managed objects are not going to be performed, except for delete actions.
##
paused: false
## Define which Nodes the Pods are scheduled on.
## ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Define resources requests and limits for single Pods.
## ref: https://kubernetes.io/docs/user-guide/compute-resources/
##
resources: {}
# requests:
# memory: 400Mi
## Pod anti-affinity can prevent the scheduler from placing Prometheus replicas on the same node.
## The default value "soft" means that the scheduler should *prefer* to not schedule two replica pods onto the same node but no guarantee is provided.
## The value "hard" means that the scheduler is *required* to not schedule two replica pods onto the same node.
## The value "" will disable pod anti-affinity so that no anti-affinity rules will be configured.
##
podAntiAffinity: ""
## If anti-affinity is enabled sets the topologyKey to use for anti-affinity.
## This can be changed to, for example, failure-domain.beta.kubernetes.io/zone
##
podAntiAffinityTopologyKey: kubernetes.io/hostname
## Assign custom affinity rules to the alertmanager instance
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/e2e-az-name
# operator: In
# values:
# - e2e-az1
# - e2e-az2
## If specified, the pod's tolerations.
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
##
tolerations: []
# - key: "key"
# operator: "Equal"
# value: "value"
# effect: "NoSchedule"
## If specified, the pod's topology spread constraints.
## ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
##
topologySpreadConstraints: []
# - maxSkew: 1
# topologyKey: topology.kubernetes.io/zone
# whenUnsatisfiable: DoNotSchedule
# labelSelector:
# matchLabels:
# app: alertmanager
## SecurityContext holds pod-level security attributes and common container settings.
## This defaults to non root user with uid 1000 and gid 2000. *v1.PodSecurityContext false
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
##
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
fsGroup: 2000
## ListenLocal makes the Alertmanager server listen on loopback, so that it does not bind against the Pod IP.
## Note this is only for the Alertmanager UI, not the gossip communication.
##
listenLocal: false
## Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to an Alertmanager pod.
##
containers: []
# Additional volumes on the output StatefulSet definition.
volumes: []
# Additional VolumeMounts on the output StatefulSet definition.
volumeMounts: []
## InitContainers allows injecting additional initContainers. This is meant to allow doing some changes
## (permissions, dir tree) on mounted volumes before starting prometheus
initContainers: []
## Priority class assigned to the Pods
##
priorityClassName: ""
## AdditionalPeers allows injecting a set of additional Alertmanagers to peer with to form a highly available cluster.
##
additionalPeers: []
## PortName to use for Alert Manager.
##
portName: "web"
## ClusterAdvertiseAddress is the explicit address to advertise in cluster. Needs to be provided for non RFC1918 [1] (public) addresses. [1] RFC1918: https://tools.ietf.org/html/rfc1918
##
clusterAdvertiseAddress: false
## ForceEnableClusterMode ensures Alertmanager does not deactivate the cluster mode when running with a single replica.
## Use case is e.g. spanning an Alertmanager cluster across Kubernetes clusters with a single replica in each.
forceEnableClusterMode: false
## Using default values from https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
##
grafana:
  enabled: true
  namespaceOverride: "monitoring"
  ## Deploy default dashboards.
  ##
  defaultDashboardsEnabled: true
  adminPassword: prom-operator
  ingress:
    ## If true, Grafana Ingress will be created
    ##
    enabled: true
    ## Annotations for Grafana Ingress
    ##
    annotations: {}
      # kubernetes.io/ingress.class: nginx
      # kubernetes.io/tls-acme: "true"
    ## Labels to be added to the Ingress
    ##
    labels: {}
    ## Hostnames.
    ## Must be provided if Ingress is enabled.
    ##
    hosts:
      - grafana.prod.test.local
    # hosts: []
    ## Path for grafana ingress
    path: /
    ## TLS configuration for grafana Ingress
    ## Secret must be manually created in the namespace
    ##
    tls: []
    # - secretName: grafana-general-tls
    #   hosts:
    #     - grafana.example.com
  sidecar:
    dashboards:
      enabled: true
      label: grafana_dashboard
      ## Annotations for Grafana dashboard configmaps
      ##
      annotations: {}
      multicluster: false
    datasources:
      enabled: true
      defaultDatasourceEnabled: true
      # If not defined, will use prometheus.prometheusSpec.scrapeInterval or its default
      # defaultDatasourceScrapeInterval: 15s
      ## Annotations for Grafana datasource configmaps
      ##
      annotations: {}
      ## Create datasource for each Pod of Prometheus StatefulSet;
      ## this uses headless service `prometheus-operated` which is
      ## created by Prometheus Operator
      ## ref: https://git.io/fjaBS
      createPrometheusReplicasDatasources: false
      label: grafana_datasource
  extraConfigmapMounts: []
  # - name: certs-configmap
  #   mountPath: /etc/grafana/ssl/
  #   configMap: certs-configmap
  #   readOnly: true
  ## Configure additional grafana datasources (passed through tpl)
  ## ref: http://docs.grafana.org/administration/provisioning/#datasources
  additionalDataSources: []
  # - name: prometheus-sample
  #   access: proxy
  #   basicAuth: true
  #   basicAuthPassword: pass
  #   basicAuthUser: daco
  #   editable: false
  #   jsonData:
  #     tlsSkipVerify: true
  #   orgId: 1
  #   type: prometheus
  #   url: https://{{ printf "%s-prometheus.svc" .Release.Name }}:9090
  #   version: 1
  ## Passed to grafana subchart and used by servicemonitor below
  ##
  service:
    portName: service
  ## If true, create a serviceMonitor for grafana
  ##
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    selfMonitor: true
    # Path to use for scraping metrics. Might be different if server.root_url is set
    # in grafana.ini
    path: "/metrics"
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping the kube api server
##
kubeApiServer:
  enabled: true
  tlsConfig:
    serverName: kubernetes
    insecureSkipVerify: false
  ## If your API endpoint address is not reachable (as in AKS) you can replace it with the kubernetes service
  ##
  relabelings: []
  # - sourceLabels:
  #     - __meta_kubernetes_namespace
  #     - __meta_kubernetes_service_name
  #     - __meta_kubernetes_endpoint_port_name
  #   action: keep
  #   regex: default;kubernetes;https
  # - targetLabel: __address__
  #   replacement: kubernetes.default.svc:443
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    jobLabel: component
    selector:
      matchLabels:
        component: apiserver
        provider: kubernetes
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
## Component scraping the kubelet and kubelet-hosted cAdvisor
##
kubelet:
  enabled: true
  namespace: kube-system
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Enable scraping the kubelet over https. For requirements to enable this see
    ## https://github.com/prometheus-operator/prometheus-operator/issues/926
    ##
    https: true
    ## Enable scraping /metrics/cadvisor from kubelet's service
    ##
    cAdvisor: true
    ## Enable scraping /metrics/probes from kubelet's service
    ##
    probes: true
    ## Enable scraping /metrics/resource from kubelet's service
    ## This is disabled by default because container metrics are already exposed by cAdvisor
    ##
    resource: true
    # From kubernetes 1.18, /metrics/resource/v1alpha1 renamed to /metrics/resource
    resourcePath: "/metrics/resource/v1alpha1"
    ## Metric relabellings to apply to samples before ingestion
    ##
    cAdvisorMetricRelabelings: []
    # - sourceLabels: [__name__, image]
    #   separator: ;
    #   regex: container_([a-z_]+);
    #   replacement: $1
    #   action: drop
    # - sourceLabels: [__name__]
    #   separator: ;
    #   regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
    #   replacement: $1
    #   action: drop
    ## Metric relabellings to apply to samples before ingestion
    ##
    probesMetricRelabelings: []
    # - sourceLabels: [__name__, image]
    #   separator: ;
    #   regex: container_([a-z_]+);
    #   replacement: $1
    #   action: drop
    # - sourceLabels: [__name__]
    #   separator: ;
    #   regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
    #   replacement: $1
    #   action: drop
    ## relabel configs to apply to samples before ingestion.
    ## metrics_path is required to match upstream rules and charts
    ##
    cAdvisorRelabelings:
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
    probesRelabelings:
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
    resourceRelabelings:
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
    metricRelabelings: []
    # - sourceLabels: [__name__, image]
    #   separator: ;
    #   regex: container_([a-z_]+);
    #   replacement: $1
    #   action: drop
    # - sourceLabels: [__name__]
    #   separator: ;
    #   regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
    #   replacement: $1
    #   action: drop
    ## relabel configs to apply to samples before ingestion.
    ## metrics_path is required to match upstream rules and charts
    ##
    relabelings:
      - sourceLabels: [__metrics_path__]
        targetLabel: metrics_path
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping the kube controller manager
##
kubeControllerManager:
  enabled: true
  ## If your kube controller manager is not deployed as a pod, specify IPs it can be found on
  ##
  endpoints: []
  # - 10.141.4.22
  # - 10.141.4.23
  # - 10.141.4.24
  ## If using kubeControllerManager.endpoints only the port and targetPort are used
  ##
  service:
    port: 10252
    targetPort: 10252
    selector:
      k8s-app: kube-controller-manager
      # component: kube-controller-manager
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Enable scraping kube-controller-manager over https.
    ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
    ##
    https: false
    # Skip TLS certificate validation when scraping
    insecureSkipVerify: null
    # Name of the server to use when validating TLS certificate
    serverName: null
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping coreDns. Use either this or kubeDns
##
coreDns:
  enabled: true
  service:
    port: 9153
    targetPort: 9153
    # selector:
    #   k8s-app: kube-dns
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping kubeDns. Use either this or coreDns
##
kubeDns:
  enabled: false
  service:
    dnsmasq:
      port: 10054
      targetPort: 10054
    skydns:
      port: 10055
      targetPort: 10055
    # selector:
    #   k8s-app: kube-dns
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
    dnsmasqMetricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    dnsmasqRelabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping etcd
##
kubeEtcd:
  enabled: true
  ## If your etcd is not deployed as a pod, specify IPs it can be found on
  ##
  endpoints: []
  # - 10.141.4.22
  # - 10.141.4.23
  # - 10.141.4.24
  ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used
  ##
  service:
    port: 2379
    targetPort: 2379
    # selector:
    #   component: etcd
  ## Configure secure access to the etcd cluster by loading a secret into prometheus and
  ## specifying security configuration below. For example, with a secret named etcd-client-cert
  ##
  ## serviceMonitor:
  ##   scheme: https
  ##   insecureSkipVerify: false
  ##   serverName: localhost
  ##   caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
  ##   certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client
  ##   keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
  ##
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    scheme: http
    insecureSkipVerify: false
    serverName: ""
    caFile: ""
    certFile: ""
    keyFile: ""
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping kube scheduler
##
kubeScheduler:
  enabled: true
  ## If your kube scheduler is not deployed as a pod, specify IPs it can be found on
  ##
  endpoints: []
  # - 10.141.4.22
  # - 10.141.4.23
  # - 10.141.4.24
  ## If using kubeScheduler.endpoints only the port and targetPort are used
  ##
  service:
    port: 10251
    targetPort: 10251
    selector:
      k8s-app: kube-scheduler
      # component: kube-scheduler
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Enable scraping kube-scheduler over https.
    ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
    ##
    https: false
    ## Skip TLS certificate validation when scraping
    insecureSkipVerify: null
    ## Name of the server to use when validating TLS certificate
    serverName: null
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Component scraping kube proxy
##
kubeProxy:
  enabled: true
  ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
  ##
  endpoints: []
  # - 10.141.4.22
  # - 10.141.4.23
  # - 10.141.4.24
  service:
    port: 10249
    targetPort: 10249
    # selector:
    #   k8s-app: kube-proxy
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Enable scraping kube-proxy over https.
    ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
    ##
    https: false
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
## Component scraping kube state metrics
##
kubeStateMetrics:
  enabled: true
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Override serviceMonitor selector
    ##
    selectorOverride: {}
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Configuration for kube-state-metrics subchart
##
kube-state-metrics:
  namespaceOverride: "monitoring"
  rbac:
    create: true
  podSecurityPolicy:
    enabled: true
## Deploy node exporter as a daemonset to all nodes
##
nodeExporter:
  enabled: true
  ## Use the value configured in prometheus-node-exporter.podLabels
  ##
  jobLabel: jobLabel
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## How long until a scrape request times out. If not set, the Prometheus default scrape timeout is used.
    ##
    scrapeTimeout: ""
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - sourceLabels: [__name__]
    #   separator: ;
    #   regex: ^node_mountstats_nfs_(event|operations|transport)_.+
    #   replacement: $1
    #   action: drop
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
## Configuration for prometheus-node-exporter subchart
##
prometheus-node-exporter:
  namespaceOverride: "monitoring"
  podLabels:
    ## Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards
    ##
    jobLabel: node-exporter
  extraArgs:
    - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
    - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
## Manages Prometheus and Alertmanager components
##
prometheusOperator:
  enabled: true
  ## Prometheus-Operator v0.39.0 and later support TLS natively.
  ##
  tls:
    enabled: true
    # Value must match version names from https://golang.org/pkg/crypto/tls/#pkg-constants
    tlsMinVersion: VersionTLS13
    # The default webhook port is 10250 in order to work out-of-the-box in GKE private clusters and avoid adding firewall rules.
    internalPort: 10250
  ## Admission webhook support for PrometheusRules resources added in Prometheus Operator 0.30 can be enabled to prevent incorrectly formatted
  ## rules from making their way into prometheus and potentially preventing the container from starting
  admissionWebhooks:
    failurePolicy: Fail
    enabled: true
    ## A PEM encoded CA bundle which will be used to validate the webhook's server certificate.
    ## If unspecified, system trust roots on the apiserver are used.
    caBundle: ""
    ## If enabled, generate a self-signed certificate, then patch the webhook configurations with the generated data.
    ## On chart upgrades (or if the secret exists) the cert will not be re-generated. You can use this to provide your own
    ## certs ahead of time if you wish.
    ##
    patch:
      enabled: true
      image:
        repository: jettech/kube-webhook-certgen
        tag: v1.5.0
        sha: ""
        pullPolicy: IfNotPresent
      resources: {}
      ## Provide a priority class name to the webhook patching job
      ##
      priorityClassName: ""
      podAnnotations: {}
      nodeSelector: {}
      affinity: {}
      tolerations: []
    # Use certmanager to generate webhook certs
    certManager:
      enabled: false
      # issuerRef:
      #   name: "issuer"
      #   kind: "ClusterIssuer"
  ## Namespaces to scope the interaction of the Prometheus Operator and the apiserver (allow list).
  ## This is mutually exclusive with denyNamespaces. Setting this to an empty object will disable the configuration
  ##
  namespaces: {}
    # releaseNamespace: true
    # additional:
    #   - kube-system
  ## Namespaces not to scope the interaction of the Prometheus Operator (deny list).
  ##
  denyNamespaces: []
  ## Filter namespaces to look for prometheus-operator custom resources
  ##
  alertmanagerInstanceNamespaces: []
  prometheusInstanceNamespaces: []
  thanosRulerInstanceNamespaces: []
  ## The clusterDomain value will be added to the cluster.peer option of the alertmanager.
  ## Without this specified option cluster.peer will have value alertmanager-monitoring-alertmanager-0.alertmanager-operated:9094 (default value)
  ## With this specified option cluster.peer will have value alertmanager-monitoring-alertmanager-0.alertmanager-operated.namespace.svc.cluster-domain:9094
  ##
  # clusterDomain: "cluster.local"
  ## Service account for Alertmanager to use.
  ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/
  ##
  serviceAccount:
    create: true
    name: ""
  ## Configuration for Prometheus operator service
  ##
  service:
    annotations: {}
    labels: {}
    clusterIP: ""
    ## Port to expose on each node
    ## Only used if service.type is 'NodePort'
    ##
    nodePort: 30080
    nodePortTls: 30443
    ## Additional ports to open for Prometheus service
    ## ref: https://kubernetes.io/docs/concepts/services-networking/service/#multi-port-services
    ##
    additionalPorts: []
    ## Loadbalancer IP
    ## Only use if service.type is "loadbalancer"
    ##
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    ## Service type
    ## NodePort, ClusterIP, loadbalancer
    ##
    type: ClusterIP
    ## List of IP addresses at which the Prometheus server service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []
  ## Labels to add to the operator pod
  ##
  podLabels: {}
  ## Annotations to add to the operator pod
  ##
  podAnnotations: {}
  ## Assign a PriorityClassName to pods if set
  # priorityClassName: ""
  ## Define Log Format
  # Use logfmt (default) or json logging
  # logFormat: logfmt
  ## Decrease log verbosity to errors only
  # logLevel: error
  ## If true, the operator will create and maintain a service for scraping kubelets
  ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/helm/prometheus-operator/README.md
  ##
  kubeletService:
    enabled: true
    namespace: kube-system
  ## Create a servicemonitor for the operator
  ##
  serviceMonitor:
    ## Scrape interval. If not set, the Prometheus default scrape interval is used.
    ##
    interval: ""
    ## Scrape timeout. If not set, the Prometheus default scrape timeout is used.
    scrapeTimeout: ""
    selfMonitor: true
    ## metric relabel configs to apply to samples before ingestion.
    ##
    metricRelabelings: []
    # - action: keep
    #   regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
    #   sourceLabels: [__name__]
    ## relabel configs to apply to samples before ingestion.
    ##
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
  ## Resource limits & requests
  ##
  resources: {}
  # limits:
  #   cpu: 200m
  #   memory: 200Mi
  # requests:
  #   cpu: 100m
  #   memory: 100Mi
  # Required for use in managed kubernetes clusters (such as AWS EKS) with custom CNI (such as calico),
  # because control-plane managed by AWS cannot communicate with pods' IP CIDR and admission webhooks are not working
  ##
  hostNetwork: false
  ## Define which Nodes the Pods are scheduled on.
  ## ref: https://kubernetes.io/docs/user-guide/node-selection/
  ##
  nodeSelector: {}
  ## Tolerations for use with node taints
  ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
  ##
  tolerations: []
  # - key: "key"
  #   operator: "Equal"
  #   value: "value"
  #   effect: "NoSchedule"
  ## Assign custom affinity rules to the prometheus operator
  ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  affinity: {}
  # nodeAffinity:
  #   requiredDuringSchedulingIgnoredDuringExecution:
  #     nodeSelectorTerms:
  #       - matchExpressions:
  #           - key: kubernetes.io/e2e-az-name
  #             operator: In
  #             values:
  #               - e2e-az1
  #               - e2e-az2
  dnsConfig: {}
  # nameservers:
  #   - 1.2.3.4
  # searches:
  #   - ns1.svc.cluster-domain.example
  #   - my.dns.search.suffix
  # options:
  #   - name: ndots
  #     value: "2"
  #   - name: edns0
  securityContext:
    fsGroup: 65534
    runAsGroup: 65534
    runAsNonRoot: true
    runAsUser: 65534
  ## Prometheus-operator image
  ##
  image:
    repository: quay.io/prometheus-operator/prometheus-operator
    tag: v0.45.0
    sha: ""
    pullPolicy: IfNotPresent
  ## Prometheus image to use for prometheuses managed by the operator
  ##
  # prometheusDefaultBaseImage: quay.io/prometheus/prometheus
  ## Alertmanager image to use for alertmanagers managed by the operator
  ##
  # alertmanagerDefaultBaseImage: quay.io/prometheus/alertmanager
  ## Prometheus-config-reloader image to use for config and rule reloading
  ##
  prometheusConfigReloaderImage:
    repository: quay.io/prometheus-operator/prometheus-config-reloader
    tag: v0.45.0
    sha: ""
  ## Set the prometheus config reloader side-car CPU limit
  ##
  configReloaderCpu: 100m
  ## Set the prometheus config reloader side-car memory limit
  ##
  configReloaderMemory: 50Mi
  ## Set a Field Selector to filter watched secrets
  ##
  secretFieldSelector: ""
## Deploy a Prometheus instance
##
prometheus:
  enabled: true
  ## Annotations for Prometheus
  ##
  annotations: {}
  ## Service account for Prometheuses to use.
  ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/
  ##
  serviceAccount:
    create: true
    name: ""
  # Service for thanos service discovery on sidecar
  # Enable this can make Thanos Query can use
  # `--store=dnssrv+_grpc._tcp.${kube-prometheus-stack.fullname}-thanos-discovery.${namespace}.svc.cluster.local` to discovery
  # Thanos sidecar on prometheus nodes
  # (Please remember to change ${kube-prometheus-stack.fullname} and ${namespace}. Not just copy and paste!)
  thanosService:
    enabled: false
    annotations: {}
    labels: {}
    portName: grpc
    port: 10901
    targetPort: "grpc"
  ## Configuration for Prometheus service
  ##
  service:
    annotations: {}
    labels: {}
    clusterIP: ""
    ## Port for Prometheus Service to listen on
    ##
    port: 9090
    ## To be used with a proxy extraContainer port
    targetPort: 9090
    ## List of IP addresses at which the Prometheus server service is available
    ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
    ##
    externalIPs: []
    ## Port to expose on each node
    ## Only used if service.type is 'NodePort'
    ##
    nodePort: 30090
    ## Loadbalancer IP
    ## Only use if service.type is "loadbalancer"
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    ## Service type
    ##
    type: ClusterIP
    sessionAffinity: ""
  ## Configuration for creating a separate Service for each statefulset Prometheus replica
  ##
  servicePerReplica:
    enabled: false
    annotations: {}
    ## Port for Prometheus Service per replica to listen on
    ##
    port: 9090
    ## To be used with a proxy extraContainer port
    targetPort: 9090
    ## Port to expose on each node
    ## Only used if servicePerReplica.type is 'NodePort'
    ##
    nodePort: 30091
    ## Loadbalancer source IP ranges
    ## Only used if servicePerReplica.type is "loadbalancer"
    loadBalancerSourceRanges: []
    ## Service type
    ##
    type: ClusterIP
  ## Configure pod disruption budgets for Prometheus
  ## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/#specifying-a-poddisruptionbudget
  ## This configuration is immutable once created and will require the PDB to be deleted to be changed
  ## https://github.com/kubernetes/kubernetes/issues/45398
  ##
  podDisruptionBudget:
    enabled: false
    minAvailable: 1
    maxUnavailable: ""
  # Ingress exposes thanos sidecar outside the cluster
  thanosIngress:
    enabled: false
    # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
    # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
    # ingressClassName: nginx
    annotations: {}
    labels: {}
    servicePort: 10901
    ## Port to expose on each node
    ## Only used if service.type is 'NodePort'
    ##
    nodePort: 30901
    ## Hosts must be provided if Ingress is enabled.
    ##
    hosts: []
    # - thanos-gateway.domain.com
    ## Paths to use for ingress rules
    ##
    paths: []
    # - /
    ## For Kubernetes >= 1.18 you should specify the pathType (determines how Ingress paths should be matched)
    ## See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#better-path-matching-with-path-types
    # pathType: ImplementationSpecific
    ## TLS configuration for Thanos Ingress
    ## Secret must be manually created in the namespace
    ##
    tls: []
    # - secretName: thanos-gateway-tls
    #   hosts:
    #     - thanos-gateway.domain.com
  ingress:
    enabled: true
    # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
    # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
    # ingressClassName: nginx
    annotations: {}
    labels: {}
    ## Hostnames.
    ## Must be provided if Ingress is enabled.
    ##
    hosts:
      - prometheus.prod.test.local
    # hosts: []
    ## Paths to use for ingress rules - one path should match the prometheusSpec.routePrefix
    ##
    paths:
      - /
    ## For Kubernetes >= 1.18 you should specify the pathType (determines how Ingress paths should be matched)
    ## See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#better-path-matching-with-path-types
    # pathType: ImplementationSpecific
    ## TLS configuration for Prometheus Ingress
    ## Secret must be manually created in the namespace
    ##
    tls: []
    # - secretName: prometheus-general-tls
    #   hosts:
    #     - prometheus.example.com
  ## Configuration for creating an Ingress that will map to each Prometheus replica service
  ## prometheus.servicePerReplica must be enabled
  ##
  ingressPerReplica:
    enabled: false
    # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
    # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
    # ingressClassName: nginx
    annotations: {}
    labels: {}
    ## Final form of the hostname for each per replica ingress is
    ## {{ ingressPerReplica.hostPrefix }}-{{ $replicaNumber }}.{{ ingressPerReplica.hostDomain }}
    ##
    ## Prefix for the per replica ingress that will have `-$replicaNumber`
    ## appended to the end
    hostPrefix: ""
    ## Domain that will be used for the per replica ingress
    hostDomain: ""
    ## Paths to use for ingress rules
    ##
    paths: []
    # - /
    ## For Kubernetes >= 1.18 you should specify the pathType (determines how Ingress paths should be matched)
    ## See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#better-path-matching-with-path-types
    # pathType: ImplementationSpecific
    ## Secret name containing the TLS certificate for Prometheus per replica ingress
    ## Secret must be manually created in the namespace
    tlsSecretName: ""
## Separated secret for each per replica Ingress. Can be used together with cert-manager
##
tlsSecretPerReplica:
enabled: false
## Final form of the secret for each per replica ingress is
## {{ tlsSecretPerReplica.prefix }}-{{ $replicaNumber }}
##
prefix: "prometheus"
## Configure additional options for default pod security policy for Prometheus
## ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/
podSecurityPolicy:
allowedCapabilities: []
allowedHostPaths: []
volumes: []
serviceMonitor:
## Scrape interval. If not set, the Prometheus default scrape interval is used.
##
interval: ""
selfMonitor: true
## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS.
scheme: ""
## tlsConfig: TLS configuration to use when scraping the endpoint. For example if using istio mTLS.
## Of type: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#tlsconfig
tlsConfig: {}
bearerTokenFile:
## metric relabel configs to apply to samples before ingestion.
##
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
##
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace
## Settings affecting prometheusSpec
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
##
prometheusSpec:
## If true, pass --storage.tsdb.max-block-duration=2h to prometheus. This is already done if using Thanos
##
disableCompaction: false
## APIServerConfig
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#apiserverconfig
##
apiserverConfig: {}
## Interval between consecutive scrapes.
## Defaults to 30s.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/release-0.44/pkg/prometheus/promcfg.go#L180-L183
##
scrapeInterval: ""
## Number of seconds to wait for target to respond before erroring
##
scrapeTimeout: ""
## Interval between consecutive evaluations.
##
evaluationInterval: ""
## ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP.
##
listenLocal: false
## EnableAdminAPI enables Prometheus the administrative HTTP API which includes functionality such as deleting time series.
## This is disabled by default.
## ref: https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis
##
enableAdminAPI: false
## Image of Prometheus.
##
image:
repository: quay.io/prometheus/prometheus
tag: v2.24.0
sha: ""
## Tolerations for use with node taints
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
##
tolerations: []
# - key: "key"
# operator: "Equal"
# value: "value"
# effect: "NoSchedule"
## If specified, the pod's topology spread constraints.
## ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
##
topologySpreadConstraints: []
# - maxSkew: 1
# topologyKey: topology.kubernetes.io/zone
# whenUnsatisfiable: DoNotSchedule
# labelSelector:
# matchLabels:
# app: prometheus
## Alertmanagers to which alerts will be sent
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#alertmanagerendpoints
##
## Default configuration will connect to the alertmanager deployed as part of this release
##
alertingEndpoints: []
# - name: ""
# namespace: ""
# port: http
# scheme: http
# pathPrefix: ""
# tlsConfig: {}
# bearerTokenFile: ""
# apiVersion: v2
## External labels to add to any time series or alerts when communicating with external systems
##
externalLabels: {}
## Name of the external label used to denote replica name
##
replicaExternalLabelName: ""
## If true, the Operator won't add the external label used to denote replica name
##
replicaExternalLabelNameClear: false
## Name of the external label used to denote Prometheus instance name
##
prometheusExternalLabelName: ""
## If true, the Operator won't add the external label used to denote Prometheus instance name
##
prometheusExternalLabelNameClear: false
## External URL at which Prometheus will be reachable.
##
externalUrl: ""
## Define which Nodes the Pods are scheduled on.
## ref: https://kubernetes.io/docs/user-guide/node-selection/
##
nodeSelector: {}
## Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods.
## The Secrets are mounted into /etc/prometheus/secrets/. Secrets changes after initial creation of a Prometheus object are not
## reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated
## with the new list of secrets.
##
secrets: []
## ConfigMaps is a list of ConfigMaps in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods.
## The ConfigMaps are mounted into /etc/prometheus/configmaps/.
##
configMaps: []
## QuerySpec defines the query command line flags when starting Prometheus.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#queryspec
##
query: {}
## Namespaces to be selected for PrometheusRules discovery.
## If nil, select own namespace. Namespaces to be selected for ServiceMonitor discovery.
## See https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#namespaceselector for usage
##
ruleNamespaceSelector: {}
## If true, a nil or {} value for prometheus.prometheusSpec.ruleSelector will cause the
## prometheus resource to be created with selectors based on values in the helm deployment,
## which will also match the PrometheusRule resources created
##
ruleSelectorNilUsesHelmValues: true
## PrometheusRules to be selected for target discovery.
## If {}, select all ServiceMonitors
##
ruleSelector: {}
## Example which select all prometheusrules resources
## with label "prometheus" with values any of "example-rules" or "example-rules-2"
# ruleSelector:
# matchExpressions:
# - key: prometheus
# operator: In
# values:
# - example-rules
# - example-rules-2
#
## Example which select all prometheusrules resources with label "role" set to "example-rules"
# ruleSelector:
# matchLabels:
# role: example-rules
## If true, a nil or {} value for prometheus.prometheusSpec.serviceMonitorSelector will cause the
## prometheus resource to be created with selectors based on values in the helm deployment,
## which will also match the servicemonitors created
##
serviceMonitorSelectorNilUsesHelmValues: true
## ServiceMonitors to be selected for target discovery.
## If {}, select all ServiceMonitors
##
serviceMonitorSelector: {}
## Example which selects ServiceMonitors with label "prometheus" set to "somelabel"
# serviceMonitorSelector:
# matchLabels:
# prometheus: somelabel
## Namespaces to be selected for ServiceMonitor discovery.
##
serviceMonitorNamespaceSelector:
matchLabels:
prometheus: enabled
## Example which selects ServiceMonitors in namespaces with label "prometheus" set to "somelabel"
# serviceMonitorNamespaceSelector:
# matchLabels:
# prometheus: somelabel
## If true, a nil or {} value for prometheus.prometheusSpec.podMonitorSelector will cause the
## prometheus resource to be created with selectors based on values in the helm deployment,
## which will also match the podmonitors created
##
podMonitorSelectorNilUsesHelmValues: true
## PodMonitors to be selected for target discovery.
## If {}, select all PodMonitors
##
podMonitorSelector: {}
## Example which selects PodMonitors with label "prometheus" set to "somelabel"
# podMonitorSelector:
# matchLabels:
# prometheus: somelabel
## Namespaces to be selected for PodMonitor discovery.
## See https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#namespaceselector for usage
##
podMonitorNamespaceSelector: {}
## If true, a nil or {} value for prometheus.prometheusSpec.probeSelector will cause the
## prometheus resource to be created with selectors based on values in the helm deployment,
## which will also match the probes created
##
probeSelectorNilUsesHelmValues: true
## Probes to be selected for target discovery.
## If {}, select all Probes
##
probeSelector: {}
## Example which selects Probes with label "prometheus" set to "somelabel"
# probeSelector:
# matchLabels:
# prometheus: somelabel
## Namespaces to be selected for Probe discovery.
## See https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#namespaceselector for usage
##
probeNamespaceSelector: {}
## How long to retain metrics
##
retention: 10d
## Maximum size of metrics
##
retentionSize: ""
## Enable compression of the write-ahead log using Snappy.
##
walCompression: false
## If true, the Operator won't process any Prometheus configuration changes
##
paused: false
## Number of replicas of each shard to deploy for a Prometheus deployment.
## Number of replicas multiplied by shards is the total number of Pods created.
##
replicas: 1
## EXPERIMENTAL: Number of shards to distribute targets onto.
## Number of replicas multiplied by shards is the total number of Pods created.
## Note that scaling down shards will not reshard data onto remaining instances, it must be manually moved.
## Increasing shards will not reshard data either but it will continue to be available from the same instances.
## To query globally use Thanos sidecar and Thanos querier or remote write data to a central location.
## Sharding is done on the content of the `__address__` target meta-label.
##
shards: 1
## Log level for Prometheus be configured in
##
logLevel: info
## Log format for Prometheus be configured in
##
logFormat: logfmt
## Prefix used to register routes, overriding externalUrl route.
## Useful for proxies that rewrite URLs.
##
routePrefix: /
## Standard object’s metadata. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata
## Metadata Labels and Annotations gets propagated to the prometheus pods.
##
podMetadata: {}
# labels:
# app: prometheus
# k8s-app: prometheus
## Pod anti-affinity can prevent the scheduler from placing Prometheus replicas on the same node.
## The default value "soft" means that the scheduler should *prefer* to not schedule two replica pods onto the same node but no guarantee is provided.
## The value "hard" means that the scheduler is *required* to not schedule two replica pods onto the same node.
## The value "" will disable pod anti-affinity so that no anti-affinity rules will be configured.
podAntiAffinity: ""
## If anti-affinity is enabled sets the topologyKey to use for anti-affinity.
## This can be changed to, for example, failure-domain.beta.kubernetes.io/zone
##
podAntiAffinityTopologyKey: kubernetes.io/hostname
## Assign custom affinity rules to the prometheus instance
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
##
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/e2e-az-name
# operator: In
# values:
# - e2e-az1
# - e2e-az2
## The remote_read spec configuration for Prometheus.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotereadspec
remoteRead: []
# - url: http://remote1/read
## The remote_write spec configuration for Prometheus.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#remotewritespec
remoteWrite: []
# - url: http://remote1/push
## Enable/Disable Grafana dashboards provisioning for prometheus remote write feature
remoteWriteDashboards: false
## Resource limits & requests
##
resources: {}
# requests:
# memory: 400Mi
## Prometheus StorageSpec for persistent data
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/storage.md
##
storageSpec:
## Using PersistentVolumeClaim
##
volumeClaimTemplate:
spec:
storageClassName: nfs-storageclass
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: 10Gi
# selector: {}
## Using tmpfs volume
##
# emptyDir:
# medium: Memory
# Additional volumes on the output StatefulSet definition.
volumes: []
# Additional VolumeMounts on the output StatefulSet definition.
volumeMounts: []
## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
## as specified in the official Prometheus documentation:
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config. As scrape configs are
## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
## scrape configs are going to break Prometheus after the upgrade.
##
## The scrape configuration example below will find master nodes, provided they have the name .*mst.*, relabel the
## port to 2379 and allow etcd scraping provided it is running on all Kubernetes master nodes
##
additionalScrapeConfigs: []
# - job_name: kube-etcd
# kubernetes_sd_configs:
# - role: node
# scheme: https
# tls_config:
# ca_file: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
# cert_file: /etc/prometheus/secrets/etcd-client-cert/etcd-client
# key_file: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
# relabel_configs:
# - action: labelmap
# regex: __meta_kubernetes_node_label_(.+)
# - source_labels: [__address__]
# action: replace
# target_label: __address__
# regex: ([^:;]+):(\d+)
# replacement: ${1}:2379
# - source_labels: [__meta_kubernetes_node_name]
# action: keep
# regex: .*mst.*
# - source_labels: [__meta_kubernetes_node_name]
# action: replace
# target_label: node
# regex: (.*)
# replacement: ${1}
# metric_relabel_configs:
# - regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone)
# action: labeldrop
## If additional scrape configurations are already deployed in a single secret file you can use this section.
## Expected values are the secret name and key
## Cannot be used with additionalScrapeConfigs
additionalScrapeConfigsSecret: {}
# enabled: false
# name:
# key:
## additionalPrometheusSecretsAnnotations allows to add annotations to the kubernetes secret. This can be useful
## when deploying via spinnaker to disable versioning on the secret, strategy.spinnaker.io/versioned: 'false'
additionalPrometheusSecretsAnnotations: {}
## AdditionalAlertManagerConfigs allows for manual configuration of alertmanager jobs in the form as specified
## in the official Prometheus documentation https://prometheus.io/docs/prometheus/latest/configuration/configuration/#<alertmanager_config>.
## AlertManager configurations specified are appended to the configurations generated by the Prometheus Operator.
## As AlertManager configs are appended, the user is responsible to make sure it is valid. Note that using this
## feature may expose the possibility to break upgrades of Prometheus. It is advised to review Prometheus release
## notes to ensure that no incompatible AlertManager configs are going to break Prometheus after the upgrade.
##
additionalAlertManagerConfigs: []
# - consul_sd_configs:
# - server: consul.dev.test:8500
# scheme: http
# datacenter: dev
# tag_separator: ','
# services:
# - metrics-prometheus-alertmanager
## AdditionalAlertRelabelConfigs allows specifying Prometheus alert relabel configurations. Alert relabel configurations specified are appended
## to the configurations generated by the Prometheus Operator. Alert relabel configurations specified must have the form as specified in the
## official Prometheus documentation: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alert_relabel_configs.
## As alert relabel configs are appended, the user is responsible to make sure it is valid. Note that using this feature may expose the
## possibility to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible alert relabel
## configs are going to break Prometheus after the upgrade.
##
additionalAlertRelabelConfigs: []
# - separator: ;
# regex: prometheus_replica
# replacement: $1
# action: labeldrop
## SecurityContext holds pod-level security attributes and common container settings.
## This defaults to non root user with uid 1000 and gid 2000.
## https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md
##
securityContext:
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
fsGroup: 2000
## Priority class assigned to the Pods
##
priorityClassName: ""
## Thanos configuration allows configuring various aspects of a Prometheus server in a Thanos environment.
## This section is experimental, it may change significantly without deprecation notice in any release.
## This is experimental and may change significantly without backward compatibility in any release.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#thanosspec
##
thanos: {}
## Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.
## if using proxy extraContainer update targetPort with proxy container port
containers: []
## InitContainers allows injecting additional initContainers. This is meant to allow doing some changes
## (permissions, dir tree) on mounted volumes before starting prometheus
initContainers: []
## PortName to use for Prometheus.
##
portName: "web"
## ArbitraryFSAccessThroughSMs configures whether configuration based on a service monitor can access arbitrary files
## on the file system of the Prometheus container e.g. bearer token files.
arbitraryFSAccessThroughSMs: false
## OverrideHonorLabels if set to true overrides all user configured honor_labels. If HonorLabels is set in ServiceMonitor
## or PodMonitor to true, this overrides honor_labels to false.
overrideHonorLabels: false
## OverrideHonorTimestamps allows to globally enforce honoring timestamps in all scrape configs.
overrideHonorTimestamps: false
## IgnoreNamespaceSelectors if set to true will ignore NamespaceSelector settings from the podmonitor and servicemonitor
## configs, and they will only discover endpoints within their current namespace. Defaults to false.
ignoreNamespaceSelectors: false
## PrometheusRulesExcludedFromEnforce - list of prometheus rules to be excluded from enforcing of adding namespace labels.
## Works only if enforcedNamespaceLabel set to true. Make sure both ruleNamespace and ruleName are set for each pair
prometheusRulesExcludedFromEnforce: []
## QueryLogFile specifies the file to which PromQL queries are logged. Note that this location must be writable,
## and can be persisted using an attached volume. Alternatively, the location can be set to a stdout location such
## as /dev/stdout to log query information to the default Prometheus log stream. This is only available in versions
## of Prometheus >= 2.16.0. For more details, see the Prometheus docs (https://prometheus.io/docs/guides/query-log/)
queryLogFile: false
## EnforcedSampleLimit defines global limit on number of scraped samples that will be accepted. This overrides any SampleLimit
## set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the SampleLimit to keep overall
## number of samples/series under the desired limit. Note that if SampleLimit is lower that value will be taken instead.
enforcedSampleLimit: false
## AllowOverlappingBlocks enables vertical compaction and vertical query merge in Prometheus. This is still experimental
## in Prometheus so it may change in any upcoming release.
allowOverlappingBlocks: false
additionalRulesForClusterRole: []
# - apiGroups: [ "" ]
# resources:
# - nodes/proxy
# verbs: [ "get", "list", "watch" ]
additionalServiceMonitors: []
## Name of the ServiceMonitor to create
##
#- name: ""
## Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from
## the chart
##
# additionalLabels: {}
## Service label for use in assembling a job name of the form <label value>-<port>
## If no label is specified, the service name is used.
##
# jobLabel: ""
## labels to transfer from the kubernetes service to the target
##
# targetLabels: []
## labels to transfer from the kubernetes pods to the target
##
# podTargetLabels: []
## Label selector for services to which this ServiceMonitor applies
##
# selector: {}
## Namespaces from which services are selected
##
# namespaceSelector: []
## Match any namespace
##
# any: false
## Explicit list of namespace names to select
##
# matchNames: []
## Endpoints of the selected service to be monitored
##
# endpoints: []
## Name of the endpoint's service port
## Mutually exclusive with targetPort
# - port: ""
## Name or number of the endpoint's target port
## Mutually exclusive with port
# - targetPort: ""
## File containing bearer token to be used when scraping targets
##
# bearerTokenFile: ""
## Interval at which metrics should be scraped
##
# interval: 30s
## HTTP path to scrape for metrics
##
# path: /metrics
## HTTP scheme to use for scraping
##
# scheme: http
## TLS configuration to use when scraping the endpoint
##
# tlsConfig:
## Path to the CA file
##
# caFile: ""
## Path to client certificate file
##
# certFile: ""
## Skip certificate verification
##
# insecureSkipVerify: false
## Path to client key file
##
# keyFile: ""
## Server name used to verify host name
##
# serverName: ""
additionalPodMonitors: []
## Name of the PodMonitor to create
##
# - name: ""
## Additional labels to set used for the PodMonitorSelector. Together with standard labels from
## the chart
##
# additionalLabels: {}
## Pod label for use in assembling a job name of the form <label value>-<port>
## If no label is specified, the pod endpoint name is used.
##
# jobLabel: ""
## Label selector for pods to which this PodMonitor applies
##
# selector: {}
## PodTargetLabels transfers labels on the Kubernetes Pod onto the target.
##
# podTargetLabels: {}
## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted.
##
# sampleLimit: 0
## Namespaces from which pods are selected
##
# namespaceSelector:
## Match any namespace
##
# any: false
## Explicit list of namespace names to select
##
# matchNames: []
## Endpoints of the selected pods to be monitored
## https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#podmetricsendpoint
##
# podMetricsEndpoints: []
Настроим Elasticsearch-exporter
он есть в том же репозитории:
https://github.com/prometheus-community/helm-charts.git с которого мы запускали сам prometheus, лежит он тут:
helm-charts/charts/prometheus-elasticsearch-exporter
Важно: сам Elasticsearch уже должен быть установлен в кластере.
смотрим нужные нам данные:
правим переменные:
vim helm-charts/charts/prometheus-elasticsearch-exporter/values.yaml
ранее мы добавляли лейбл командой:
проставим на них label release=prometheus
kubectl label namespace --all "prometheus=enabled"
проверяем что всё ок
kubectl get ns --show-labels | grep elk
elk Active 71d prometheus=enabled
теперь смотрим на какое имя нам надо будет ссылаться:
kubectl get service -n elk | grep 9200
elasticsearch-master ClusterIP 13.100.200.219 <none> 9200/TCP,9300/TCP 28d
elasticsearch-master-headless ClusterIP None <none> 9200/TCP,9300/TCP 28d
в конфиге будем указывать elasticsearch-master
смотрим какие есть лейблы на этом сервисе:
kubectl describe service -n elk elasticsearch-master | grep -A4 Labels
Labels: app=elasticsearch-master
app.kubernetes.io/managed-by=Helm
chart=elasticsearch
heritage=Helm
release=elasticsearch
нас интересует app=elasticsearch-master
правим конфиг:
vim prometheus-elasticsearch-exporter/values.yaml
es:
uri: http://elasticsearch-master:9200
serviceMonitor:
## If true, a ServiceMonitor CRD is created for a prometheus operator
## https://github.com/coreos/prometheus-operator
##
enabled: true
namespace: monitoring
labels:
app: elasticsearch-master
release: prometheus
interval: 10s
scrapeTimeout: 10s
scheme: http
relabelings: []
targetLabels:
app: elasticsearch-master
release: prometheus
metricRelabelings: []
sampleLimit: 0
в полном виде конфиг выглядит так:
## number of exporter instances
##
replicaCount: 1
## restart policy for all containers
##
restartPolicy: Always
image:
repository: justwatch/elasticsearch_exporter
tag: 1.1.0
pullPolicy: IfNotPresent
pullSecret: ""
## Set enabled to false if you don't want securityContext
## in your Deployment.
## The below values are the default for kubernetes.
## Openshift won't deploy with runAsUser: 1000 without additional permissions.
securityContext:
enabled: true # Should be set to false when running on OpenShift
runAsUser: 1000
# Custom DNS configuration to be added to prometheus-elasticsearch-exporter pods
dnsConfig: {}
# nameservers:
# - 1.2.3.4
# searches:
# - ns1.svc.cluster-domain.example
# - my.dns.search.suffix
# options:
# - name: ndots
# value: "2"
# - name: edns0
log:
format: logfmt
level: info
resources: {}
# requests:
# cpu: 100m
# memory: 128Mi
# limits:
# cpu: 100m
# memory: 128Mi
priorityClassName: ""
nodeSelector: {}
tolerations: []
podAnnotations: {}
podLabels: {}
affinity: {}
service:
type: ClusterIP
httpPort: 9108
metricsPort:
name: http
annotations: {}
labels: {}
## Extra environment variables that will be passed into the exporter pod
## example:
## env:
## KEY_1: value1
## KEY_2: value2
env: {}
## The name of a secret in the same kubernetes namespace which contain values to be added to the environment
## This can be useful for auth tokens, etc
envFromSecret: ""
## A list of environment variables from secret refs that will be passed into the exporter pod
## example:
## This will set ${ES_PASSWORD} to the 'password' key from the 'my-secret' secret
## extraEnvSecrets:
## ES_PASSWORD:
## secret: my-secret
## key: password
extraEnvSecrets: {}
# A list of secrets and their paths to mount inside the pod
# This is useful for mounting certificates for security
secretMounts: []
# - name: elastic-certs
# secretName: elastic-certs
# path: /ssl
# A list of additional Volume to add to the deployment
# this is useful if the volume you need is not a secret (csi volume etc.)
extraVolumes: []
# - name: csi-volume
# csi:
# driver: secrets-store.csi.k8s.io
# readOnly: true
# volumeAttributes:
# secretProviderClass: my-spc
# A list of additional VolumeMounts to add to the deployment
# this is useful for mounting any other needed resource into
# the elasticsearch-exporter pod
extraVolumeMounts: []
# - name: csi-volume
# mountPath: /csi/volume
# readOnly: true
es:
## Address (host and port) of the Elasticsearch node we should connect to.
## This could be a local node (localhost:9200, for instance), or the address
## of a remote Elasticsearch server. When basic auth is needed,
## specify as: <proto>://<user>:<password>@<host>:<port>. e.g., http://admin:pass@localhost:9200.
##
uri: http://elasticsearch-master:9200
## If true, query stats for all nodes in the cluster, rather than just the
## node we connect to.
##
all: true
## If true, query stats for all indices in the cluster.
##
indices: true
## If true, query settings stats for all indices in the cluster.
##
indices_settings: true
## If true, query stats for shards in the cluster.
##
shards: true
## If true, query stats for snapshots in the cluster.
##
snapshots: true
## If true, query stats for cluster settings.
##
cluster_settings: false
## Timeout for trying to get stats from Elasticsearch. (ex: 20s)
##
timeout: 30s
## Skip SSL verification when connecting to Elasticsearch
## (only available if image.tag >= 1.0.4rc1)
##
sslSkipVerify: false
ssl:
## If true, a secure connection to ES cluster is used
##
enabled: false
## If true, certs from secretMounts will be need to be referenced instead of certs below
##
useExistingSecrets: false
ca:
## PEM that contains trusted CAs used for setting up secure Elasticsearch connection
##
# pem:
# Path of ca pem file which should match a secretMount path
path: /ssl/ca.pem
client:
## if true, client SSL certificate is used for authentication
##
enabled: true
## PEM that contains the client cert to connect to Elasticsearch.
##
# pem:
# Path of client pem file which should match a secretMount path
pemPath: /ssl/client.pem
## Private key for client auth when connecting to Elasticsearch
##
# key:
# Path of client key file which should match a secretMount path
keyPath: /ssl/client.key
web:
## Path under which to expose metrics.
##
path: /metrics
serviceMonitor:
## If true, a ServiceMonitor CRD is created for a prometheus operator
## https://github.com/coreos/prometheus-operator
##
enabled: true
namespace: monitoring
labels:
app: elasticsearch-master
release: prometheus
interval: 10s
scrapeTimeout: 10s
scheme: http
relabelings: []
targetLabels:
app: elasticsearch-master
release: prometheus
metricRelabelings: []
sampleLimit: 0
prometheusRule:
## If true, a PrometheusRule CRD is created for a prometheus operator
## https://github.com/coreos/prometheus-operator
##
## The rules will be processed as Helm template, allowing to set variables in them.
enabled: false
# namespace: monitoring
labels: {}
rules: []
# - record: elasticsearch_filesystem_data_used_percent
# expr: |
# 100 * (elasticsearch_filesystem_data_size_bytes{service="{{ template "elasticsearch-exporter.fullname" . }}"} - elasticsearch_filesystem_data_free_bytes{service="{{ template "elasticsearch-exporter.fullname" . }}"})
# / elasticsearch_filesystem_data_size_bytes{service="{{ template "elasticsearch-exporter.fullname" . }}"}
# - record: elasticsearch_filesystem_data_free_percent
# expr: 100 - elasticsearch_filesystem_data_used_percent{service="{{ template "elasticsearch-exporter.fullname" . }}"}
# - alert: ElasticsearchTooFewNodesRunning
# expr: elasticsearch_cluster_health_number_of_nodes{service="{{ template "elasticsearch-exporter.fullname" . }}"} < 3
# for: 5m
# labels:
# severity: critical
# annotations:
# description: There are only {{ "{{ $value }}" }} < 3 ElasticSearch nodes running
# summary: ElasticSearch running on less than 3 nodes
# - alert: ElasticsearchHeapTooHigh
# expr: |
# elasticsearch_jvm_memory_used_bytes{service="{{ template "elasticsearch-exporter.fullname" . }}", area="heap"} / elasticsearch_jvm_memory_max_bytes{service="{{ template "elasticsearch-exporter.fullname" . }}", area="heap"}
# > 0.9
# for: 15m
# labels:
# severity: critical
# annotations:
# description: The heap usage is over 90% for 15m
# summary: ElasticSearch node {{ "{{ $labels.node }}" }} heap usage is high
# Create a service account
# To use a service account not handled by the chart, set the name here
# and set create to false
serviceAccount:
create: false
name: default
# Creates a PodSecurityPolicy and the role/rolebinding
# allowing the serviceaccount to use it
podSecurityPolicies:
enabled: false
можем устанавливать:
helm install elasticsearch-exporter --values prometheus-elasticsearch-exporter/values.yaml prometheus-elasticsearch-exporter/ -n elk
через какое-то время появится target
4.exporter rabbitmq
прогоняем label по всем namespace
kubectl label namespace --all "prometheus=enabled"
у меня уже установлен rabbitmq в кластере в namespace rabbitmq, прометеус в namespace monitoring
пароль от rabbitmq у меня закрыт в секрете:
kubectl get secrets -n rabbitmq | grep pass
secret-admin-password Opaque 1 4d22h
vim prometheus-rabbitmq-exporter/values.yaml
loglevel: info
rabbitmq:
url: http://rabbitmq-headless.rabbitmq.svc.test.local:15672
user: admin
password: secret-admin-password
# If existingPasswordSecret is set then password is ignored
existingPasswordSecret: ~
prometheus:
monitor:
enabled: true
additionalLabels:
release: prometheus
interval: 15s
namespace: []
helm install rabbitmq-exporter prometheus-rabbitmq-exporter/ -n monitoring --values prometheus-rabbitmq-exporter/values.yaml
5. exporter redis
прогоняем label по всем namespace
kubectl label namespace --all "prometheus=enabled"
у меня уже установлен redis в кластере в namespace redis, прометеус в namespace monitoring
пароль от redis у меня закрыт в секрете:
[root@prod-vsrv-kubemaster1 charts]# kubectl get secrets -n redis | grep -E 'NAME|password'
NAME TYPE DATA AGE
redis-password Opaque 1 27h
vim prometheus-redis-exporter/values.yaml
redisAddress: redis://redis-cluster-headless.redis.svc.test.local:6379
serviceMonitor:
enabled: true
namespace: monitoring
# Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator
labels:
release: prometheus
auth:
# Use password authentication
enabled: true
# Use existing secret (ignores redisPassword)
secret:
name: redis-password
key: redis-password
helm install redis-exporter prometheus-redis-exporter/ -n redis --values prometheus-redis-exporter/values.yaml
6. настройка оповещений в telegram
Для начала создадим telegram bot
идём на @BotFather
нажимаем start и получаем список команд:
/newbot — отправляем ему и бот просит придумать имя нашему новому боту. Единственное ограничение на имя — оно должно оканчиваться на «bot». В случае успеха BotFather возвращает токен бота и ссылку для быстрого добавления бота в контакты, иначе придется поломать голову над именем.
всё мы зарегались, теперь этот токен можно использовать при подключении нашего алертменеджера к телеграму
cat default.tmpl
{{ define "telegram.default" }}
{{ range .Alerts }}
{{ if eq .Status "firing"}}? <b>{{ .Status | toUpper }}</b> ? {{ else }}<b>{{ .Status | toUpper }}</b>{{ end }}
<b>{{ .Labels.alertname }}</b>
{{ .Annotations.message }} {{ .Annotations.description }}
<b>Duration:</b> {{ duration .StartsAt .EndsAt }}{{ if ne .Status "firing"}}
<b>Ended:</b> {{ .EndsAt | since }}{{ end }}
{{ end }}
{{ end }}
cat Dockerfile
FROM metalmatze/alertmanager-bot:0.4.2
COPY ./default.tmpl /templates/default.tmpl
собираем образ пушим в наш гитлаб
далее идём в телеграмм в канал:
userinfobot
печатаем старт и получаем наш id
далее выполняем:
echo -n "4196184" | base64
получаем хэш
NDE5NjE4NA==
а так же получаем хэш нашего телеграм токена:
echo -n "1788359733:AAFf3cK6dfEPHV5e7ePXnHP6x6GHWzEQoSw" | base64
MTc4ODM1OTczMzpBQUZmM2NLNmRmRVBIVjVlN2VQWG5IUDZ4NkdIV3pFUW9Tdw==
создаём deployment
cat telegrambot.yml
apiVersion: v1
items:
- apiVersion: v1
data:
admin1: NDE5NjE4NA==
admin2: NTY=
admin3: NDE=
token: MTc4ODM1OTczMzpBQUZmM2NLNmRmRVBIVjVlN2VQWG5IUDZ4NkdIV3pFUW9Tdw==
kind: Secret
metadata:
labels:
app.kubernetes.io/name: alertmanager-bot
name: alertmanager-bot
namespace: monitoring
type: Opaque
- apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/name: alertmanager-bot
name: alertmanager-bot
namespace: monitoring
spec:
ports:
- name: http
port: 8080
targetPort: 8080
selector:
app.kubernetes.io/name: alertmanager-bot
- apiVersion: apps/v1
kind: StatefulSet
metadata:
labels:
app.kubernetes.io/name: alertmanager-bot
name: alertmanager-bot
namespace: monitoring
spec:
podManagementPolicy: OrderedReady
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: alertmanager-bot
serviceName: alertmanager-bot
template:
metadata:
labels:
app.kubernetes.io/name: alertmanager-bot
name: alertmanager-bot
namespace: monitoring
spec:
containers:
- args:
- --alertmanager.url=http://alertmanager-operated:9093
- --log.level=info
- --store=bolt
- --bolt.path=/data/bot.db
- --telegram.admin=4196184
- --telegram.admin=56
- --telegram.admin=41
env:
# - name: TELEGRAM_ADMIN
# valueFrom:
# secretKeyRef:
# key: admin
# name: alertmanager-bot
- name: TELEGRAM_TOKEN
valueFrom:
secretKeyRef:
key: token
name: alertmanager-bot
image: gitlab.test.local:4567/monitoring/alertbot
imagePullPolicy: IfNotPresent
name: alertmanager-bot
ports:
- containerPort: 8080
name: http
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 25m
memory: 64Mi
volumeMounts:
- mountPath: /data
name: alertmanager-bot
restartPolicy: Always
imagePullSecrets:
- name: regcred
volumeClaimTemplates:
- metadata:
labels:
app.kubernetes.io/name: alertmanager-bot
name: alertmanager-bot
namespace: monitoring
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 1Gi
storageClassName: nfs-storageclass
kind: List
admin1 — тут указываю хэши id пользователей которые будут заходить
token — тут указываем токен нашего телеграм бота (хэш)
namespace — тут указываем неймспейс в котором у нас запущен prometheus
image — тут указываем образ телеграмбота пересобранного и загруженного в наш гитлаб
--telegram.admin — тут id пользователей в открытом виде
можем запускать:
kubectl apply -f telegrambot.yml
всё можно проверять:
пишем /start
и бот отвечает:
6.1 настройка оповещений в telegram, в различные чаты(группы)
Задача — настроить оповещения в разные чаты телеграмма
за основу будет взят телеграм бот:
https://github.com/inCaller/prometheus_bot
который был заточен под helm chart
https://github.com/gvych/telegram-bot-helm-chart
отмечу сразу что его надо дописывать в values так как с нуля он не стартует.
приступим, создаём в телеграм новую группу:
добавляем нашего бота которого мы создали в предыдущем пункте, так как я дополняю статью позже, то имя бота у меня другое:
далее добавляем к группе бота который позволит увидеть chatid
вот мы получили chatid запомним его.
Выкачиваем репозиторий:
git clone https://github.com/gvych/telegram-bot-helm-chart.git
cd telegram-bot-helm-chart
правим версию для deployment и выставляем selector (изначально их нет в гите), общий вид будет такой:
cat telegram-bot/templates/deployment.yaml
{{- if not .Values.application.initializeCommand -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ template "trackableappname" . }}
labels:
app: {{ template "appname" . }}
track: "{{ .Values.application.track }}"
tier: "{{ .Values.application.tier }}"
chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}"
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: {{ template "appname" . }}
template:
metadata:
annotations:
checksum/application-secrets: "{{ .Values.application.secretChecksum }}"
labels:
app: {{ template "appname" . }}
track: "{{ .Values.application.track }}"
tier: "{{ .Values.application.tier }}"
release: {{ .Release.Name }}
spec:
imagePullSecrets:
{{ toYaml .Values.image.secrets | indent 10 }}
volumes:
- configMap:
defaultMode: 420
name: {{ template "trackableappname" . }}.config
name: config-volume
containers:
- name: {{ .Chart.Name }}
image: "moghaddas/prometheus_bot"
imagePullPolicy: {{ .Values.image.pullPolicy }}
volumeMounts:
- mountPath: /config.yaml
name: config-volume
subPath: config.yaml
- mountPath: /alert.tmpl
name: config-volume
subPath: alert.tmpl
ports:
- name: "{{ .Values.service.name }}"
containerPort: {{ .Values.service.internalPort }}
livenessProbe:
tcpSocket:
port: {{ .Values.service.internalPort }}
initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
readinessProbe:
tcpSocket:
port: {{ .Values.service.internalPort }}
initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }}
timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
resources:
{{ toYaml .Values.resources | indent 12 }}
{{- end -}}
также правим конфиг, чтобы в уведомлении видно было alertname, description, message — по умолчанию их нету в дефолте.
telegram-bot-helm/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ template "trackableappname" . }}.config
labels:
app: {{ template "appname" . }}
chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}"
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
data:
config.yaml: |-
telegram_token: "{{ .Values.telegram.token }}"
template_path: "/alert.tmpl"
time_zone: "UTC"
split_token: "|"
split_msg_byte: 4000
alert.tmpl: |-
{{ "{{" }} if eq .Status "firing" {{ "}}" }} <b>[PROBLEM]</b> {{ "{{" }} else {{ "}}" }} <b>[RECOVERY]</b> {{ "{{" }} end {{ "}}" }}
{{ "{{" }} index (index .Alerts 0).Labels "alertname" {{ "}}" }}
{{ "{{" }} index (index .Alerts 0).Annotations "description"{{ "}}" }}
{{ "{{" }} index (index .Alerts 0).Annotations "message"{{ "}}" }}
теперь правим файл с переменными:
cat telegram-bot/values.yaml
# Default values for chart.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
telegram:
token: "139453456058954:AAFaZIepDdfggTql0dfgdfPT6dq1Edqu5dfggxFD6Wr08j0g"
chat_id: "-567616984" #not used at the moment
releaseOverride: alertmanager-bot-chat-id
image:
pullPolicy: Always
application:
track: stable
tier: web
migrateCommand:
initializeCommand:
secretName:
secretChecksum:
service:
enabled: true
name: web
type: ClusterIP
url: http://alertmanager-operated:9093
additionalHosts:
commonName:
externalPort: 9087
internalPort: 9087
livenessProbe:
initialDelaySeconds: 15
readinessProbe:
initialDelaySeconds: 5
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 10m
memory: 8Mi
здесь token — это токен нашего телеграм бота мы его получаем при его регистрации в botfather
chat_id — это id нашей группы
url: http://alertmanager-operated:9093 это наш адрес alermanager увидеть его можно следующим образом:
kubectl get service -n monitoring | grep alertmanager-operated
alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 57d
для каждой группы мы будем запускать свой телеграмбот, вот второй:
cat telegram-bot/values-test.yaml
# Default values for chart.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
telegram:
token: "139453456058954:AAFaZIepDdfggTql0dfgdfPT6dq1Edqu5dfggxFD6Wr08j0g"
chat_id: "-480100545" #not used at the moment
releaseOverride: alertmanager-bot-test
image:
pullPolicy: Always
application:
track: stable
tier: web
migrateCommand:
initializeCommand:
secretName:
secretChecksum:
service:
enabled: true
name: web
type: ClusterIP
url: http://alertmanager-operated:9093
additionalHosts:
commonName:
externalPort: 9087
internalPort: 9087
livenessProbe:
initialDelaySeconds: 15
readinessProbe:
initialDelaySeconds: 5
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 10m
memory: 8Mi
ставим первый:
helm upgrade --install telegram-bot-chat-id telegram-bot/ -f telegram-bot/values.yaml --namespace monitoring
и второй:
helm upgrade --install telegram-bot-test telegram-bot/ -f telegram-bot/values-test.yaml --namespace monitoring
далее создаём своё кастомное правило:
cat prometheus-alert-rule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations:
meta.helm.sh/release-name: prometheus
meta.helm.sh/release-namespace: monitoring
prometheus-operator-validated: "true"
labels:
app: kube-prometheus-stack
release: prometheus
name: my-test-prometheus-alertmanager.rules
namespace: monitoring
selfLink: /apis/monitoring.coreos.com/v1/namespaces/monitoring/prometheusrules/my-test-prometheus-alertmanager.rules
spec:
groups:
- name: my-test-alertmanager.rules
rules:
- alert: EBNULSA_CONTAINER
annotations:
message: |
CONTAINER_UMER
Namespace: {{ $labels.namespace }} and
Podname: {{ $labels.pod }}
expr: sum_over_time(kube_pod_container_status_ready{namespace="my-site"}[5m])
<1
for: 1m
labels:
severity: critical
team: namespace-my-site
и второе:
cat prometheus-alert-rule-test.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations:
meta.helm.sh/release-name: prometheus
meta.helm.sh/release-namespace: monitoring
prometheus-operator-validated: "true"
labels:
app: kube-prometheus-stack
release: prometheus
name: test-prometheus-alertmanager.rules
namespace: monitoring
selfLink: /apis/monitoring.coreos.com/v1/namespaces/monitoring/prometheusrules/test-prometheus-alertmanager.rules
spec:
groups:
- name: test-alertmanager.rules
rules:
- alert: EBNULSA_CONTAINER-namespace-test
annotations:
message: |
CONTAINER_UMER
Namespace: {{ $labels.namespace }} and
Podname: {{ $labels.pod }}
expr: sum_over_time(kube_pod_container_status_ready{namespace="test"}[5m])
<1
for: 1m
labels:
severity: critical
team: namespace-test
тут обращаем внимание на разное название лейблов
team: namespace-my-site
и
team: namespace-test
согласно данным лейблам алертменеджер будет раскидывать в нужные группы.
смотрим что данные правила создались:
kubectl -n monitoring get prometheusrules.monitoring.coreos.com
NAME AGE
my-test-prometheus-alertmanager.rules 19h
prometheus-kube-prometheus-alertmanager.rules 57d
prometheus-kube-prometheus-etcd 57d
prometheus-kube-prometheus-general.rules 57d
prometheus-kube-prometheus-k8s.rules 57d
prometheus-kube-prometheus-kube-apiserver-availability.rules 57d
prometheus-kube-prometheus-kube-apiserver-slos 57d
prometheus-kube-prometheus-kube-apiserver.rules 57d
prometheus-kube-prometheus-kube-prometheus-general.rules 57d
prometheus-kube-prometheus-kube-prometheus-node-recording.rules 57d
prometheus-kube-prometheus-kube-scheduler.rules 57d
prometheus-kube-prometheus-kube-state-metrics 57d
prometheus-kube-prometheus-kubelet.rules 57d
prometheus-kube-prometheus-kubernetes-apps 57d
prometheus-kube-prometheus-kubernetes-resources 57d
prometheus-kube-prometheus-kubernetes-storage 57d
prometheus-kube-prometheus-kubernetes-system 57d
prometheus-kube-prometheus-kubernetes-system-apiserver 57d
prometheus-kube-prometheus-kubernetes-system-controller-manager 57d
prometheus-kube-prometheus-kubernetes-system-kubelet 57d
prometheus-kube-prometheus-kubernetes-system-scheduler 57d
prometheus-kube-prometheus-node-exporter 57d
prometheus-kube-prometheus-node-exporter.rules 57d
prometheus-kube-prometheus-node-network 57d
prometheus-kube-prometheus-node.rules 57d
prometheus-kube-prometheus-prometheus 57d
prometheus-kube-prometheus-prometheus-operator 57d
test-prometheus-alertmanager.rules 18h
далее правим конфиг алертменеджера, не забываем что в нашем случае это гит:
https://github.com/prometheus-community/helm-charts.git
правим файл:
helm-charts/charts/kube-prometheus-stack/values.yaml
alertmanager:
## Deploy alertmanager
enabled: true
apiVersion: v2
serviceAccount:
create: true
name: ""
annotations: {}
podDisruptionBudget:
enabled: false
minAvailable: 1
maxUnavailable: ""
config:
global:
resolve_timeout: 5m
smtp_smarthost: 10.230.144.56:25
route:
# receiver: 'telegram'
receiver: 'email_unixadmins'
routes:
- receiver: "telegram"
group_wait: 10s
repeat_interval: 48h
match_re:
severity: "critical|warning"
continue: true
- receiver: "telegram"
group_wait: 10s
repeat_interval: 48h
match_re:
alertname: "Watchdog"
continue: true
- receiver: "email_unixadmins"
group_wait: 10s
repeat_interval: 48h
match_re:
severity: "critical"
continue: true
- receiver: "telegram-my-site"
match_re:
severity: "critical"
team: "namespace-my-site"
- receiver: "telegram-test"
match_re:
severity: "critical"
team: "namespace-test"
receivers:
- name: 'telegram'
webhook_configs:
- send_resolved: true
# url: 'http://alertmanager-bot:8080'
url: 'http://alertmanager-bot-chat-id:9087/alert/-567616984'
- name: 'telegram-my-site'
webhook_configs:
- send_resolved: true
url: 'http://alertmanager-bot-for-my-site:9087/alert/-581835428'
- name: 'telegram-test'
webhook_configs:
- send_resolved: true
url: 'http://alertmanager-bot-test:9087/alert/-480100545'
- name: 'email_unixadmins'
email_configs:
- to: 'admin1@test.ru'
from: 'prod-vsrv-kuber-alertmanager@test.ru'
require_tls: false
send_resolved: true
- to: 'admin2@test.ru'
from: 'prod-vsrv-kuber-alertmanager@test.ru'
require_tls: false
send_resolved: true
обращаю внимание, что установить несколько типов severity можно в таком виде:
severity: "critical|warning"
запись вида:
continue: true
(по умолчанию она false) означает что после первого совпадения надо продолжать роутить сообщения
всё дальше можно апдейтить:
helm upgrade --install prometheus kube-prometheus-stack/ -f kube-prometheus-stack/values.yaml --namespace monitoring
6.2. настройка оповещений в telegram разграничение оповещений по группам (исключения уведомлений)
Вводная: есть админский чат и есть чат разработчиков. При настройке как в пункте 6.1 уведомления, приходящие в чат разработчиков, дублируются и в чат админов.
данная ситуация происходит вообще потому, что alertmanager со следующим конфигом:
config:
global:
resolve_timeout: 5m
smtp_smarthost: 10.230.144.56:25
route:
# receiver: 'telegram'
receiver: 'email_unixadmins'
routes:
- receiver: "telegram-admins"
group_wait: 10s
repeat_interval: 1h
match_re:
severity: "critical|warning"
continue: true
- receiver: "telegram-admins"
group_wait: 10s
repeat_interval: 48h
match_re:
alertname: "Watchdog"
continue: true
- receiver: "telegram-terminal-soft"
group_wait: 10s
repeat_interval: 1h
match_re:
severity: "critical"
team: "terminal-soft"
continue: true
имеет настройку
continue: true (по дефолту false)
благодаря которой уведомления попав под первое правило не прекращаются а отправляются дальше по route и отправляются по другим receiver (когда совпадают по label)
ВАЖНО!!!!!!!!!!!!!! в записи:
match_re:
severity: "critical"
team: "terminal-soft"
правила совпадения работают не как OR а как AND (т.е. должны совпасть ВСЕ лейблы)
Задача — исключить из чата админов сообщения, отправляемые в чат разработчиков, чтобы админам прилетали все дефолтные
Решение — возможно тупенькое но я другого не нашёл, работать будет так:
прилетает сообщение, с лейблами:
severity: "critical"
team: "terminal-soft"
значит оно должно попасть только в группу terminal-soft, поэтому для receiver: "telegram-terminal-soft" оставляем
match_re:
team: "terminal-soft"
но так как в уведомлении будет прилетать лейбл
severity: "critical" то он будет попадать под совпадение receiver: "telegram-admins" у которого
match_re:
severity: "critical|warning"
нам этого не нужно поэтому для
receiver: "telegram-terminal-soft"
ставим continue: false и тогда обработка следующих routes не будет происходить.
вывод перед админским чатом правило должно быть с условием:
continue: false
а админский чат последний в списке.
теперь рассмотрим всё это по конфигам:
правило по которому будет срабатывать алерт:
test.rule.yml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations:
meta.helm.sh/release-name: prometheus
meta.helm.sh/release-namespace: monitoring
prometheus-operator-validated: "true"
labels:
app: kube-prometheus-stack
release: prometheus
name: mega-mega24-cloud-prometheus-alertmanager.rules
namespace: monitoring
selfLink: /apis/monitoring.coreos.com/v1/namespaces/monitoring/prometheusrules/mega-mega24-cloud-prometheus-alertmanager.rules
spec:
groups:
- name: mega-mega24-cloud-alertmanager.rules
rules:
- alert: EBNULSA_CONTAINER
annotations:
message: |
CONTAINER_UMER
Namespace: {{ $labels.namespace }} and
Podname: {{ $labels.pod }}
expr: sum_over_time(kube_pod_container_status_ready{namespace="mega-mega24-cloud"}[2m])
<1
for: 1m
labels:
team: "terminal-soft"
тут обращаем внимание на наличие лейбла team: «terminal-soft» и отсутствие лейбла severity: critical
конфиг алерт менеджера:
cat helm-charts/charts/kube-prometheus-stack/values.yaml
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
##
config:
global:
resolve_timeout: 5m
smtp_smarthost: 10.230.144.56:25
route:
# receiver: 'telegram'
receiver: 'email_unixadmins'
routes:
- receiver: "email_unixadmins"
group_wait: 10s
repeat_interval: 48h
match_re:
severity: "critical"
continue: true
- receiver: "telegram-admins"
group_wait: 10s
repeat_interval: 48h
match_re:
alertname: "Watchdog"
continue: true
- receiver: "telegram-terminal-soft"
group_wait: 10s
repeat_interval: 1h
match_re:
team: "terminal-soft"
continue: false
- receiver: "telegram-admins"
group_wait: 10s
repeat_interval: 1h
match_re:
severity: "critical|warning"
continue: true
receivers:
- name: 'telegram-admins'
webhook_configs:
- send_resolved: true
url: 'http://telegram-admins-group:9087/alert/-1001441100259'
- name: 'telegram-terminal-soft'
webhook_configs:
- send_resolved: true
url: 'http://telegram-terminal-soft:9087/alert/-597056946'
- name: 'email_unixadmins'
email_configs:
- to: 'user1@test.ru'
from: 'prod-vsrv-kuber-alertmanager@test.ru'
require_tls: false
send_resolved: true
- to: 'user2@test.ru'
from: 'prod-vsrv-kuber-alertmanager@test.ru'
require_tls: false
send_resolved: true
тут видим что предпоследнее правило имеет вид:
- receiver: "telegram-terminal-soft"
group_wait: 10s
repeat_interval: 1h
match_re:
team: "terminal-soft"
continue: false
а последнее правило для работы дефолтных правил (которые есть в prometheus по умолчанию)
- receiver: "telegram-admins"
group_wait: 10s
repeat_interval: 1h
match_re:
severity: "critical|warning"
continue: true
7.Проблема с prometheus-kube-proxy
столкнулся со следующей проблемой, после запуска прометеуса не отображаются метрики с kube-proxy
прикол в следующем, сам kube-proxy стартанул на 127.0.0.1
[root@kub-worker-2 ~]# netstat -ntpl | grep 10249
tcp 0 0 127.0.0.1:10249 0.0.0.0:* LISTEN 2537/kube-proxy
а прометеус лезет на айпишник т.е. щимится на ноды а там ни кто не отвечает:
[root@kub-master-1 charts]# telnet 192.168.1.205 10249
Trying 192.168.1.205…
telnet: connect to address 192.168.1.205: Connection refused
что для исправления делаем, НА ВСЕХ НОДАХ правим:
[root@kub-master-1 charts]# vim /etc/kubernetes/kube-proxy-config.yaml
c
metricsBindAddress: 127.0.0.1:10249
на
metricsBindAddress: 0.0.0.0:10249
общий вид у файла такой:
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
bindAddress: 0.0.0.0
clientConnection:
acceptContentTypes:
burst: 10
contentType: application/vnd.kubernetes.protobuf
kubeconfig: /etc/kubernetes/kube-proxy-kubeconfig.yaml
qps: 5
clusterCIDR: 10.0.0.0/16
configSyncPeriod: 15m0s
conntrack:
maxPerCore: 32768
min: 131072
tcpCloseWaitTimeout: 1h0m0s
tcpEstablishedTimeout: 24h0m0s
enableProfiling: False
healthzBindAddress: 127.0.0.1
hostnameOverride: kub-master-1
iptables:
masqueradeAll: False
masqueradeBit: 14
minSyncPeriod: 0s
syncPeriod: 30s
ipvs:
excludeCIDRs: []
minSyncPeriod: 0s
scheduler: rr
syncPeriod: 30s
strictARP: False
metricsBindAddress: 0.0.0.0:10249
mode: iptables
nodePortAddresses: []
oomScoreAdj: -999
portRange:
udpIdleTimeout: 250ms
далее перезапускаем:
[root@kub-master-1 charts]# kubectl delete pod -n kube-system kube-proxy-kub-master-1 kube-proxy-kub-master-2 kube-proxy-kub-master-3 kube-proxy-kub-worker-1 kube-proxy-kub-worker-2
pod «kube-proxy-kub-master-1» deleted
pod «kube-proxy-kub-master-2» deleted
pod «kube-proxy-kub-master-3» deleted
pod «kube-proxy-kub-worker-1» deleted
pod «kube-proxy-kub-worker-2» deleted
Проверяем доступность:
[root@kub-master-1 charts]# telnet 192.168.1.205 10249
Trying 192.168.1.205…
Connected to 192.168.1.205.
Escape character is ‘^]’.
^]
telnet> quit
Connection closed.
и как видим метрики теперь отображаются:
8.Настройка алерта для определённого namespace
У меня есть тестовый сервис:
cat my-site-ingress.yaml
---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: my-ingress
namespace: my-site
spec:
rules:
- host: test.ru #тут указывается наш домен
http:
paths: #список путей которые хотим обслуживать(он дефолтный и все запросы будут отправляться на бэкенд, т.е. на сервис my-service-apache)
- backend:
serviceName: my-service-apache #тут указывается наш сервис
servicePort: 80 #порт на котором сервис слушает
# path: / все запросы на корень '/' будут уходить на наш сервис
cat my-site-service.yaml
---
apiVersion: v1
kind: Service
metadata:
name: my-service-apache # имя сервиса
namespace: my-site
spec:
ports:
- port: 80 # принимать на 80
targetPort: 80 # отправлять на 80
selector:
app: apache #отправлять на все поды с данным лейблом
type: ClusterIP
cat my-site.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: my-deployment-apache
namespace: my-site
spec:
replicas: 1
selector:
matchLabels:
app: apache # по вот этому лейблу репликасет цепляет под
# тут описывается каким мокаром следует обновлять поды
strategy:
rollingUpdate:
maxSurge: 1 # указывает на какое количество реплик можно увеличить
maxUnavailable: 1 # указывает на какое количество реплик можно уменьшить
#т.е. в одно время при обновлении, будет увеличено на один (новый под) и уменьшено на один (старый под)
type: RollingUpdate
## тут начинается описание контейнера
template:
metadata:
labels:
app: apache # по вот этому лейблу репликасет цепляет под
spec:
containers:
- image: httpd:2.4.43
name: apache
ports:
- containerPort: 80
# тут начинаются проверки по доступности
readinessProbe: # проверка готово ли приложение
failureThreshold: 3 #указывает количество провалов при проверке
httpGet: # по сути дёргает курлом на 80 порт
path: /
port: 80
periodSeconds: 10 #как часто должна проходить проверка (в секундах)
successThreshold: 1 #сбрасывает счётчик неудач, т.е. при 3х проверках если 1 раз успешно прошло, то счётчик сбрасывается и всё ок
timeoutSeconds: 1 #таймаут на выполнение пробы 1 секунда
livenessProbe: #проверка на жизнь приложения, живо ли оно
failureThreshold: 3
httpGet:
path: /
port: 80
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
initialDelaySeconds: 10 #означает что первую проверку надо сделать только после 10 секунд
# тут начинается описание лимитов для пода
resources:
requests: #количество ресурсов которые резервируются для pod на ноде
cpu: 60m
memory: 200Mi
limits: #количество ресурсов которые pod может использовать(верхняя граница)
cpu: 120m
memory: 300Mi
применяем
kubectl create ns my-site
kubectl apply -f my-site-ingress.yaml -f my-site-service.yaml -f my-site.yaml
проверяем
[root@kub-master-1 ~]# kubectl get pod -n my-site
NAME READY STATUS RESTARTS AGE
my-deployment-apache-859486bd8c-zk99f 1/1 Running 0 11m
как видим всё ок.
теперь сделаем так чтобы сервис постоянно падал и перезапускался, для этого подправим в деплойменте проверки(readinessProbe/livenessProbe) порта не 80 а 81:
cat my-site.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-deployment-apache
  namespace: my-site
spec:
  replicas: 1
  selector:
    matchLabels:
      app: apache  # the ReplicaSet selects pods by this label
  # how pods should be replaced during an update
  strategy:
    rollingUpdate:
      maxSurge: 1        # how many extra replicas may be created during an update
      maxUnavailable: 1  # how many replicas may be taken down during an update
      # i.e. during an update one new pod is started and one old pod is removed at a time
    type: RollingUpdate
  # pod template starts here
  template:
    metadata:
      labels:
        app: apache  # must match spec.selector.matchLabels above
    spec:
      containers:
        - image: httpd:2.4.43
          name: apache
          ports:
            - containerPort: 80
          # availability checks; port 81 is deliberately wrong here so the probes fail
          readinessProbe:         # is the application ready to serve traffic
            failureThreshold: 3   # number of probe failures before the pod is marked unready
            httpGet:              # effectively an HTTP GET against the given port
              path: /
              port: 81
            periodSeconds: 10     # how often the probe runs (seconds)
            successThreshold: 1   # a single success resets the failure counter
            timeoutSeconds: 1     # probe timeout, 1 second
          livenessProbe:          # is the application still alive
            failureThreshold: 3
            httpGet:
              path: /
              port: 81
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
            initialDelaySeconds: 10  # run the first probe only 10 seconds after start
          # resource limits for the pod
          resources:
            requests:  # resources reserved for the pod on the node
              cpu: 60m
              memory: 200Mi
            limits:    # upper bound of resources the pod may consume
              cpu: 120m
              memory: 300Mi
и применим:
kubectl apply -f my-site.yaml
как видим pod перезапускается:
[root@kub-master-1 ~]# kubectl get pod -n my-site
NAME READY STATUS RESTARTS AGE
my-deployment-apache-85978bf68f-mbwlm 0/1 Running 1 41s
но не может пройти проверки.
посмотрим что в метриках на prometheus:
применим promql запрос:
kube_pod_container_status_ready{namespace="my-site"}[5m]
который смотрит статус контейнеров по namespace my-site за последние 5 минут.
как видим у нас 2 разных имени контейнера:
my-deployment-apache-859486bd8c-zk99f (который был запущен ранее и с ним было всё нормально)
и
my-deployment-apache-85978bf68f-mbwlm (текущий, который был специально сломан через неправильные проверки)
теперь нам надо получить результат, были ли за последние 5 минут незапущенные контейнеры, для этого используем следующий запрос:
sum_over_time(kube_pod_container_status_ready{namespace="my-site"}[5m]) < 1
который смотрит были ли контейнеры со статусом МЕНЬШЕ 1 (т.е. не запущенные) за 5 минут
для проверки можем увеличить время до 900 минут и глянем что он выведет:
как видим таких было 3 контейнера
возвращаем проверки в деплойменте ждём 5 минут и проверяем статус:
как видим за 5 минут упавших контейнеров не было.
теперь привяжем это к alertmanager.
правим имеющееся правила прометеуса:
kubectl -n monitoring edit prometheusrules prometheus-kube-prometheus-alertmanager.rules
и в общий список где перечисляются правила:
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
annotations:
message: |
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash",
alertmanager_config_hash{job="prometheus-kube-prometheus-alertmanager",namespace="monitoring"}))
!= 1
for: 5m
labels:
severity: critical
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
expr: alertmanager_config_last_reload_successful{job="prometheus-kube-prometheus-alertmanager",namespace="monitoring"}
== 0
for: 10m
labels:
severity: warning
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
expr: |-
alertmanager_cluster_members{job="prometheus-kube-prometheus-alertmanager",namespace="monitoring"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="prometheus-kube-prometheus-alertmanager",namespace="monitoring"})
for: 5m
labels:
severity: critical
добавляем наше:
- alert: EBNULSA_CONTAINER
annotations:
message: CONTAINER_UMER
expr: sum_over_time(kube_pod_container_status_ready{namespace="my-site"}[5m])
<1
for: 1m
labels:
severity: critical
выходим сохраняемся.
в прометеусе переходим на вкладку alerts и видим наше правило:
всё теперь можно снова ломать проверки в нашем деплойменте и проверять полетел ли алерт:
как видим полетел.
проверяем наш телеграм бот и видим:
в таком вот виде настраивается алертинг.
Теперь рассмотрим как нам добавлять свой алертинг а не править имеющийся.
смотрим имеющие правила:
[root@kub-master-1 ~]# kubectl -n monitoring get prometheusrules.monitoring.coreos.com
NAME AGE
prometheus-kube-prometheus-alertmanager.rules 3d
prometheus-kube-prometheus-etcd 3d
prometheus-kube-prometheus-general.rules 3d
prometheus-kube-prometheus-k8s.rules 3d
prometheus-kube-prometheus-kube-apiserver-availability.rules 3d
prometheus-kube-prometheus-kube-apiserver-slos 3d
prometheus-kube-prometheus-kube-apiserver.rules 3d
prometheus-kube-prometheus-kube-prometheus-general.rules 3d
prometheus-kube-prometheus-kube-prometheus-node-recording.rules 3d
prometheus-kube-prometheus-kube-scheduler.rules 3d
prometheus-kube-prometheus-kube-state-metrics 3d
prometheus-kube-prometheus-kubelet.rules 3d
prometheus-kube-prometheus-kubernetes-apps 3d
prometheus-kube-prometheus-kubernetes-resources 3d
prometheus-kube-prometheus-kubernetes-storage 3d
prometheus-kube-prometheus-kubernetes-system 3d
prometheus-kube-prometheus-kubernetes-system-apiserver 3d
prometheus-kube-prometheus-kubernetes-system-controller-manager 3d
prometheus-kube-prometheus-kubernetes-system-kubelet 3d
prometheus-kube-prometheus-kubernetes-system-scheduler 3d
prometheus-kube-prometheus-node-exporter 3d
prometheus-kube-prometheus-node-exporter.rules 3d
prometheus-kube-prometheus-node-network 3d
prometheus-kube-prometheus-node.rules 3d
prometheus-kube-prometheus-prometheus 3d
prometheus-kube-prometheus-prometheus-operator 3d
добавляем наше:
cat prometheus-alert-rule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations:
    meta.helm.sh/release-name: prometheus
    meta.helm.sh/release-namespace: monitoring
    prometheus-operator-validated: "true"
  labels:
    # these labels must match the ruleSelector of the Prometheus CR,
    # otherwise the operator will not pick the rule up
    app: kube-prometheus-stack
    release: prometheus
  name: my-test-prometheus-alertmanager.rules
  namespace: monitoring
  # NOTE: selfLink is a server-populated, deprecated field and was dropped —
  # it must not be present in a manifest you `kubectl apply`
spec:
  groups:
    - name: my-test-alertmanager.rules
      rules:
        - alert: EBNULSA_CONTAINER
          annotations:
            message: |
              CONTAINER_UMER
              Namespace: {{ $labels.namespace }} and
              Podname: {{ $labels.pod }}
          # fires when a container in the my-site namespace was not ready
          # at some point during the last 5 minutes
          expr: sum_over_time(kube_pod_container_status_ready{namespace="my-site"}[5m]) < 1
          for: 1m
          labels:
            severity: critical
применяем:
[root@kub-master-1 ~]# kubectl apply -f prometheus-alert-rule.yaml
проверяем:
[root@kub-master-1 ~]# kubectl -n monitoring get prometheusrules.monitoring.coreos.com | grep my
my-test-prometheus-alertmanager.rules 91m
как видим наше правило добавилось.
запись вида:
Namespace: {{ $labels.namespace }}
Podname: {{ $labels.pod }}
выведет имя неймспейса и имя пода.
в телеграме это будет отображаться следующим образом:
9.Добавление оповещений и по email
правим файл:
vim charts/kube-prometheus-stack/values.yaml
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
##
config:
  global:
    resolve_timeout: 5m
    smtp_smarthost: 10.20.44.56:25  # SMTP relay used for outgoing mail
  route:
    # receiver: 'telegram'
    receiver: 'email_unixadmins'  # default receiver: gets everything regardless of severity
    routes:
      # critical alerts additionally go to telegram
      - receiver: "telegram"
        group_wait: 10s
        repeat_interval: 48h
        match_re:
          severity: "critical"
        continue: true  # keep evaluating the remaining routes
      # the Watchdog heartbeat alert also goes to telegram
      - receiver: "telegram"
        group_wait: 10s
        repeat_interval: 48h
        match_re:
          alertname: "Watchdog"
        continue: true
      # critical alerts are duplicated to e-mail
      - receiver: "email_unixadmins"
        group_wait: 10s
        repeat_interval: 48h
        match_re:
          severity: "critical"
        continue: true
  receivers:
    - name: 'telegram'
      webhook_configs:
        - send_resolved: true
          url: 'http://alertmanager-bot:8080'  # in-cluster telegram bot service
    - name: 'email_unixadmins'
      email_configs:
        - to: 'admin1@test.ru'
          from: 'prod-vsrv-kuber-alertmanager@test.ru'
          require_tls: false
          send_resolved: true
        - to: 'admin2@test.ru'
          from: 'prod-vsrv-kuber-alertmanager@test.ru'
          require_tls: false
          send_resolved: true
smtp_smarthost: 10.20.44.56:25 это наш smtp хост через который мы шлём почту.
receiver: 'email_unixadmins' — на него будут идти оповещения вне зависимости от критичности алерта, для остальных можно выставлять уровень критичности.
и применяем:
helm upgrade --install prometheus kube-prometheus-stack/ -f kube-prometheus-stack/values.yaml --namespace monitoring
10. Настройка графиков в grafana
создаём новый дашборд
переходим к созданию панели
она будет отображать только наш неймспейс terminal-soft
создадим панель которая будет отображать сколько процессорного времени использует namespace
запрос выглядит следующим образом:
sum(rate(container_cpu_usage_seconds_total{namespace="terminal-soft"}[5m]))
настраиваем отображение:
сохраняем:
теперь добавим panel по использованию оперативной памяти в namespace terminal-soft
также создаём новую панель, и используем запрос:
sum(rate(container_memory_usage_bytes{namespace="terminal-soft"}[5m]))
в левой колонке ставим параметр, в чём измеряем (в нашем случае в байтах)
и настраиваем отображаемую легенду, а именно минимальные, максимальные значения и т.д.
всё можно сохраняться
как видим 2 графика у нас уже отображаются нормально:
теперь отобразим занятое дисковое пространство persistentvolume
создаём новую панель
запрос будет выглядеть следующим образом:
(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="$volume"} - kubelet_volume_stats_available_bytes{persistentvolumeclaim="$volume"}) / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="$volume"} * 100
у нас появилась переменная volume, рассмотрим как её создать:
переходим в настройки dashboard
добавим несколько переменных,
первая cluster запрос:
label_values(kubelet_volume_stats_capacity_bytes, cluster)
не забываем ставить Hide Variables чтобы в панели он не отображался
вторая namespace, запрос:
label_values(kubelet_volume_stats_capacity_bytes{cluster="$cluster", job="kubelet", metrics_path="/metrics"}, namespace)
и третья volume запрос:
label_values(kubelet_volume_stats_capacity_bytes{cluster="$cluster", job="kubelet", metrics_path="/metrics", namespace="$namespace"}, persistentvolumeclaim)
вот наши 3 переменные:
теперь смотрим на нашу панель:
добавляем legend в описании пишем {{namespace}} (чтоб отображался наш неймспейс)
также видим что вверху появились НЕ СКРЫТЫЕ переменные которые мы можем выбирать.
Отображение сети (входящий/исходящий трафик)
используем 2 метрики:
входящий трафик для namespace terminal soft:
sum(rate(container_network_receive_bytes_total{pod=~"deployment.+",namespace="terminal-soft"}[1m]))
исходящий трафик для namespace terminal soft:
sum(rate(container_network_transmit_bytes_total{pod=~"deployment.+",namespace="terminal-soft"}[1m]))
Отображение кодов ответа на nginx ingress controller
создаём ещё одну панель.
дёргаем метрику:
sum(increase(nginx_ingress_controller_request_duration_seconds_count{namespace="terminal-soft"}[1m])) by (status) > 0
в качестве legend ставим
{{status}} code
можно сохранять.
ещё добавим несколько графиков для отображения количества ответов по статусам
200(2**)
300(3**)
400(4**)
500(5**)
создаём панель и добавляем запрос:
sum(increase(nginx_ingress_controller_requests{namespace="terminal-soft",status=~"2.*"}[1m]))
добавляем ещё несколько панелей запросы будут аналогичны первому:
sum(increase(nginx_ingress_controller_requests{namespace="terminal-soft",status=~"3.*"}[1m]))
sum(increase(nginx_ingress_controller_requests{namespace="terminal-soft",status=~"4.*"}[1m]))
sum(increase(nginx_ingress_controller_requests{namespace="terminal-soft",status=~"5.*"}[1m]))
по итогу у нас получился вот такой dashboard
рассмотрим ещё один дашборд где при выборе namespace будут отображаться Проц/оперативка/сеть как на весь неймспейс так и на каждый под в отдельности
вот так оно будет выглядеть по итогу
переходим в настройки:
далее создаём переменные:
cluster
label_values(kube_pod_info, cluster)
namespace
label_values(kube_pod_info{cluster="$cluster"}, namespace)
перейдём к настройке самих дашбордов — первый по оперативке:
sum(rate(container_memory_usage_bytes{namespace="$namespace"}[5m]))
all MEMORY in $namespace
sum(rate(container_memory_working_set_bytes{namespace="$namespace", container!="", image!=""}[5m])) by (pod)
{{pod}}
далее проц
sum(rate(container_cpu_usage_seconds_total{namespace="$namespace"}[5m]))
all CPU in $namespace
sum(rate(container_cpu_usage_seconds_total{namespace="$namespace", container!="", image!=""}[5m])) by (pod)
{{pod}}
далее рассмотрим сеть:
sum(rate(container_network_receive_bytes_total{namespace="$namespace"}[1m]))
INPUT in ALL $namespace
sum(rate(container_network_transmit_bytes_total{namespace="$namespace"}[1m]))
OUTPUT in ALL $namespace
sum(rate(container_network_receive_bytes_total{namespace="$namespace", container!="", image!=""}[5m])) by (pod)
input in {{pod}}
sum(rate(container_network_transmit_bytes_total{namespace="$namespace", container!="", image!=""}[5m])) by (pod)
output in {{pod}}
Источник: https://sidmid.ru/kubernetes-запуск-prometheus-grafana-alertmanager-запуск-exporter-для-ingress-nginx-controller/#grafana