Commit ef63f4c5 authored by Geoff Simmons

Add metrics to pkg/varnish.

parent b913502c
/*
* Copyright (c) 2019 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
package varnish
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
// namespace is passed as the Namespace option of every metric created
// in this file.
const namespace = "varnishingctl"

// subsystem is passed as the Subsystem option of every metric created
// in this file.
const subsystem = "varnish"
// instanceMetrics groups the Prometheus collectors that are kept for a
// single Varnish instance. One value is created per instance address by
// getInstanceMetrics.
type instanceMetrics struct {
	// Attempted updates of the instance, and updates that failed.
	updates, updateErrs prometheus.Counter
	// Failed connections to the admin interface.
	connectFails prometheus.Counter
	// Successful and failed VCL loads.
	vclLoads, vclLoadErrs prometheus.Counter
	// Latencies of admin connections and of VCL loads.
	connectLatency, vclLoadLatency prometheus.Summary
	// Successful and failed pings.
	pings, pingFails prometheus.Counter
	// Panics detected at the instance.
	panics prometheus.Counter
	// Monitor runs that found the child process running / not running.
	childRunning, childNotRunning prometheus.Counter
	// VCL discards.
	vclDiscards prometheus.Counter
	// Monitor checks.
	monitorChecks prometheus.Counter
}
// Package-wide gauges; they are registered by initMetrics.
var (
	// svcsGauge is the current number of managed Varnish services.
	svcsGauge = prometheus.NewGauge(prometheus.GaugeOpts{
		Help:      "Current number of managed Varnish services",
		Name:      "services",
		Subsystem: subsystem,
		Namespace: namespace,
	})

	// instsGauge is the current number of managed Varnish instances.
	instsGauge = prometheus.NewGauge(prometheus.GaugeOpts{
		Help:      "Current number of managed Varnish instances",
		Name:      "instances",
		Subsystem: subsystem,
		Namespace: namespace,
	})

	// secretsGauge is the current number of known admin secrets.
	secretsGauge = prometheus.NewGauge(prometheus.GaugeOpts{
		Help:      "Current number of known admin secrets",
		Name:      "secrets",
		Subsystem: subsystem,
		Namespace: namespace,
	})
)

// Per-instance metrics state used by getInstanceMetrics.
var (
	// addr2instMetrics maps an instance address to its metrics set.
	addr2instMetrics = make(map[string]*instanceMetrics)

	// instMetricsMtx is held by getInstanceMetrics while it reads or
	// writes addr2instMetrics.
	instMetricsMtx = new(sync.Mutex)
)

// latencyObjectives is used as the Objectives option of the latency
// summaries: each key is a quantile, each value the error tolerated
// for that quantile.
var latencyObjectives = map[float64]float64{
	0.5:   0.001,
	0.9:   0.001,
	0.95:  0.001,
	0.99:  0.001,
	0.999: 0.001,
}
// initMetrics registers the package-wide gauges (services, instances
// and secrets) via prometheus.Register. Any error returned by Register
// is discarded.
func initMetrics() {
	gauges := []prometheus.Collector{svcsGauge, instsGauge, secretsGauge}
	for _, g := range gauges {
		_ = prometheus.Register(g)
	}
}
// getInstanceMetrics returns the metrics set for the Varnish instance
// at addr. On the first call for an address the collectors are created
// with the constant label varnish_instance=addr, registered via
// prometheus.Register, and stored in addr2instMetrics; later calls
// return the stored set. The whole operation runs with instMetricsMtx
// held.
func getInstanceMetrics(addr string) *instanceMetrics {
	instMetricsMtx.Lock()
	defer instMetricsMtx.Unlock()

	if known, ok := addr2instMetrics[addr]; ok {
		return known
	}

	instLabels := map[string]string{"varnish_instance": addr}

	// newCounter builds a counter in this package's namespace and
	// subsystem, labelled with the instance address.
	newCounter := func(name, help string) prometheus.Counter {
		return prometheus.NewCounter(prometheus.CounterOpts{
			Namespace:   namespace,
			Subsystem:   subsystem,
			Name:        name,
			Help:        help,
			ConstLabels: instLabels,
		})
	}

	// newLatency builds a summary with the shared latency objectives,
	// labelled with the instance address.
	newLatency := func(name, help string) prometheus.Summary {
		return prometheus.NewSummary(prometheus.SummaryOpts{
			Namespace:   namespace,
			Subsystem:   subsystem,
			Name:        name,
			Help:        help,
			ConstLabels: instLabels,
			Objectives:  latencyObjectives,
		})
	}

	m := &instanceMetrics{
		updates: newCounter("updates_total",
			"Total number of attempted updates"),
		updateErrs: newCounter("update_errors_total",
			"Total number of update errors"),
		connectFails: newCounter("admin_connect_fails_total",
			"Total number of admin connection failures"),
		vclLoads: newCounter("vcl_loads_total",
			"Total number of successful VCL loads"),
		vclLoadErrs: newCounter("vcl_load_errors_total",
			"Total number of VCL load errors"),
		connectLatency: newLatency("admin_connect_latency_seconds",
			"Admin connection latency"),
		vclLoadLatency: newLatency("vcl_load_latency_seconds",
			"VCL load latency"),
		pings: newCounter("pings_total",
			"Total number of successful pings"),
		pingFails: newCounter("ping_errors_total",
			"Total number of ping errors"),
		panics: newCounter("panics_total",
			"Total number of panics detected"),
		childRunning: newCounter("child_running_total",
			"Total number of monitor runs with the "+
				"child process in the running state"),
		childNotRunning: newCounter("child_not_running_total",
			"Total number of monitor runs with the "+
				"child process not in the running state"),
		vclDiscards: newCounter("vcl_discards_total",
			"Total number of VCL discards"),
		monitorChecks: newCounter("monitor_checks_total",
			"Total number of monitor checks"),
	}

	// Register in the same order as the fields above; the error
	// returned by Register is discarded.
	collectors := []prometheus.Collector{
		m.updates,
		m.updateErrs,
		m.connectFails,
		m.vclLoads,
		m.vclLoadErrs,
		m.connectLatency,
		m.vclLoadLatency,
		m.pings,
		m.pingFails,
		m.panics,
		m.childRunning,
		m.childNotRunning,
		m.vclDiscards,
		m.monitorChecks,
	}
	for _, c := range collectors {
		_ = prometheus.Register(c)
	}

	addr2instMetrics[addr] = m
	return m
}
......@@ -33,6 +33,7 @@ import (
"time"
"code.uplex.de/uplex-varnish/varnishapi/pkg/admin"
"github.com/prometheus/client_golang/prometheus"
)
const (
......@@ -71,6 +72,9 @@ func (vc *VarnishController) errorEvt(svc, reason, msgFmt string,
}
func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
metrics := getInstanceMetrics(inst.addr)
metrics.monitorChecks.Inc()
if inst.admSecret == nil {
vc.warnEvt(svc, noAdmSecret,
"No admin secret known for endpoint %s", inst.addr)
......@@ -79,8 +83,11 @@ func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
inst.admMtx.Lock()
defer inst.admMtx.Unlock()
timer := prometheus.NewTimer(metrics.connectLatency)
adm, err := admin.Dial(inst.addr, *inst.admSecret, admTimeout)
timer.ObserveDuration()
if err != nil {
metrics.connectFails.Inc()
vc.errorEvt(svc, connectErr, "Error connecting to %s: %v",
inst.addr, err)
return false
......@@ -92,10 +99,12 @@ func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
pong, err := adm.Ping()
if err != nil {
metrics.pingFails.Inc()
vc.errorEvt(svc, pingErr, "Error pinging at %s: %v", inst.addr,
err)
return false
}
metrics.pings.Inc()
vc.log.Infof("Succesfully pinged instance %s: %+v", inst.addr, pong)
state, err := adm.Status()
......@@ -105,8 +114,10 @@ func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
return false
}
if state == admin.Running {
metrics.childRunning.Inc()
vc.log.Infof("Status at %s: %s", inst.addr, state)
} else {
metrics.childNotRunning.Inc()
vc.warnEvt(svc, statusNotRun, "Status at %s: %s", inst.addr,
state)
}
......@@ -120,6 +131,7 @@ func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
if panic == "" {
vc.log.Infof("No panic at %s", inst.addr)
} else {
metrics.panics.Inc()
vc.errorEvt(svc, panic, "Panic at %s: %s", inst.addr, panic)
// XXX clear the panic? Should be configurable
}
......@@ -139,6 +151,7 @@ func (vc *VarnishController) checkInst(svc string, inst *varnishInst) bool {
"%v", vcl.Name, inst.addr, err)
return false
}
metrics.vclDiscards.Inc()
vc.log.Infof("Discarded VCL %s at %s", vcl.Name,
inst.addr)
}
......
......@@ -48,6 +48,7 @@ import (
"code.uplex.de/uplex-varnish/k8s-ingress/pkg/varnish/vcl"
"code.uplex.de/uplex-varnish/varnishapi/pkg/admin"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
......@@ -164,6 +165,7 @@ func NewVarnishController(
if err := vcl.InitTemplates(tmplDir); err != nil {
return nil, err
}
initMetrics()
return &VarnishController{
svcs: make(map[string]*varnishSvc),
secrets: make(map[string]*[]byte),
......@@ -189,11 +191,7 @@ func (vc *VarnishController) Start(errChan chan error) {
}
func (vc *VarnishController) updateVarnishInstance(inst *varnishInst,
cfgName string, vclSrc string) error {
if inst == nil {
return fmt.Errorf("Instance object is nil")
}
cfgName string, vclSrc string, metrics *instanceMetrics) error {
vc.log.Infof("Update Varnish instance at %s", inst.addr)
vc.log.Debugf("Varnish instance %s: %+v", inst.addr, *inst)
......@@ -204,8 +202,11 @@ func (vc *VarnishController) updateVarnishInstance(inst *varnishInst,
defer inst.admMtx.Unlock()
vc.log.Debugf("Connect to %s, timeout=%v", inst.addr, admTimeout)
timer := prometheus.NewTimer(metrics.connectLatency)
adm, err := admin.Dial(inst.addr, *inst.admSecret, admTimeout)
timer.ObserveDuration()
if err != nil {
metrics.connectFails.Inc()
return err
}
defer adm.Close()
......@@ -238,12 +239,16 @@ func (vc *VarnishController) updateVarnishInstance(inst *varnishInst,
inst.addr)
} else {
vc.log.Debugf("Load config %s at %s", cfgName, inst.addr)
timer = prometheus.NewTimer(metrics.vclLoadLatency)
err = adm.VCLInline(cfgName, vclSrc)
timer.ObserveDuration()
if err != nil {
vc.log.Debugf("Error loading config %s at %s: %v",
cfgName, inst.addr, err)
metrics.vclLoadErrs.Inc()
return err
}
metrics.vclLoads.Inc()
vc.log.Infof("Loaded config %s at Varnish endpoint %s", cfgName,
inst.addr)
}
......@@ -304,9 +309,18 @@ func (vc *VarnishController) updateVarnishSvc(name string) error {
vc.log.Infof("Update Varnish instances: load config %s", cfgName)
var errs VarnishAdmErrors
for _, inst := range svc.instances {
if e := vc.updateVarnishInstance(inst, cfgName, vclSrc); e != nil {
if inst == nil {
vc.log.Errorf("Instance object is nil")
continue
}
metrics := getInstanceMetrics(inst.addr)
metrics.updates.Inc()
if e := vc.updateVarnishInstance(inst, cfgName, vclSrc,
metrics); e != nil {
admErr := VarnishAdmError{addr: inst.addr, err: e}
errs = append(errs, admErr)
metrics.updateErrs.Inc()
continue
}
}
......@@ -329,17 +343,21 @@ func (vc *VarnishController) setCfgLabel(inst *varnishInst,
err: fmt.Errorf("No known admin secret"),
}
}
metrics := getInstanceMetrics(inst.addr)
inst.admMtx.Lock()
defer inst.admMtx.Unlock()
vc.log.Debugf("Connect to %s, timeout=%v", inst.addr, admTimeout)
timer := prometheus.NewTimer(metrics.connectLatency)
adm, err := admin.Dial(inst.addr, *inst.admSecret, admTimeout)
timer.ObserveDuration()
if err != nil {
if mayClose {
vc.log.Warnf("Could not connect to %s: %v", inst.addr,
err)
return nil
}
metrics.connectFails.Inc()
return VarnishAdmError{addr: inst.addr, err: err}
}
defer adm.Close()
......@@ -373,6 +391,7 @@ func (vc *VarnishController) removeVarnishInstances(insts []*varnishInst) error
errs = append(errs, admErr)
continue
}
instsGauge.Dec()
}
if len(errs) == 0 {
return nil
......@@ -413,6 +432,7 @@ func (vc *VarnishController) updateVarnishSvcAddrs(key string,
admMtx: &sync.Mutex{},
}
newInsts = append(newInsts, newInst)
instsGauge.Inc()
}
for addr, inst := range prevAddrs {
_, exists := updateAddrs[addr]
......@@ -435,6 +455,7 @@ func (vc *VarnishController) updateVarnishSvcAddrs(key string,
errs = append(errs, admErr)
continue
}
instsGauge.Dec()
}
vc.log.Debugf("Varnish svc %s config: %+v", key, *svc)
......@@ -482,9 +503,11 @@ func (vc *VarnishController) AddOrUpdateVarnishSvc(key string,
vc.log.Debugf("Varnish svc %s: creating instance %+v",
key, *instance)
instances = append(instances, instance)
instsGauge.Inc()
}
svc.instances = instances
vc.svcs[key] = svc
svcsGauge.Inc()
vc.log.Debugf("Varnish svc %s: created config", key)
}
vc.log.Debugf("Varnish svc %s config: %+v", key, svc)
......@@ -518,6 +541,7 @@ func (vc *VarnishController) DeleteVarnishSvc(key string) error {
err := vc.removeVarnishInstances(svc.instances)
if err != nil {
delete(vc.svcs, key)
svcsGauge.Dec()
}
return err
}
......@@ -535,6 +559,7 @@ func (vc *VarnishController) Update(
if !exists {
svc = &varnishSvc{instances: make([]*varnishInst, 0)}
vc.svcs[svcKey] = svc
svcsGauge.Inc()
vc.log.Infof("Added Varnish service definition %s for Ingress "+
"%s uid=%s", svcKey, ingKey, uid)
}
......@@ -621,6 +646,7 @@ func (vc *VarnishController) SetAdmSecret(key string, secret []byte) {
secretSlice := make([]byte, len(secret))
secr = &secretSlice
vc.secrets[key] = secr
secretsGauge.Inc()
}
copy(*vc.secrets[key], secret)
}
......@@ -645,6 +671,7 @@ func (vc *VarnishController) UpdateSvcForSecret(svcKey, secretKey string) error
svcKey, secretKey)
svc = &varnishSvc{instances: make([]*varnishInst, 0)}
vc.svcs[svcKey] = svc
svcsGauge.Inc()
}
svc.secrName = secretKey
......@@ -664,6 +691,7 @@ func (vc *VarnishController) DeleteAdmSecret(name string) {
_, exists := vc.secrets[name]
if exists {
delete(vc.secrets, name)
secretsGauge.Dec()
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment