From dde0d190d5128637da311874e207efe0dc3d2860 Mon Sep 17 00:00:00 2001 From: Ivan Borshukov Date: Fri, 10 Feb 2017 12:40:03 +0200 Subject: [PATCH] mozzle: Emit both client and server HTTP metrics The firehose provides metrics for communication between the CF router and the application instances, and the end user and CF router. Now both metrics are propagated via mozzle. The way to distinguish them is via the 'peer' metric attribute, which has value of either 'server' or 'client'. The attribute describes where the metrics are actually measured: HTTP HTTP user <----> router <----> application instance server client * server - metrics are measured as the router is writing the response back to the client. * client - metrics are measured as the router is requesting an application instance. Fixes https://github.com/Bo0mer/mozzle/issues/7. --- demo/mib/dashboards/http_statistics.json | 327 +++++++++++++++++++---- demo/mib/dashboards/overview.json | 16 +- doc.go | 6 +- httpmetrics.go | 31 ++- 4 files changed, 315 insertions(+), 65 deletions(-) diff --git a/demo/mib/dashboards/http_statistics.json b/demo/mib/dashboards/http_statistics.json index 7c276f5..912eb5f 100644 --- a/demo/mib/dashboards/http_statistics.json +++ b/demo/mib/dashboards/http_statistics.json @@ -231,7 +231,7 @@ }, "refresh": "30s", "schemaVersion": 13, - "version": 35, + "version": 40, "links": [ { "icon": "external link", @@ -298,7 +298,7 @@ } ], "policy": "default", - "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"application\" =~ /^$app_name$/ AND $timeFilter GROUP BY \"status_code\"", + "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"application\" =~ /^$app_name$/ AND \"peer\" =~ /^server$/ AND $timeFilter GROUP BY \"status_code\"", "rawQuery": true, "refId": "A", "resultFormat": "table", @@ -374,7 +374,7 @@ "steppedLine": 
false, "targets": [ { - "alias": "mean", + "alias": "99", "dsType": "influxdb", "groupBy": [ { @@ -394,7 +394,7 @@ "policy": "default", "query": "SELECT \"value\" FROM \"http response content_length_bytes\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"application\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($timeInterval)", "rawQuery": false, - "refId": "A", + "refId": "C", "resultFormat": "time_series", "select": [ [ @@ -405,8 +405,10 @@ "type": "field" }, { - "params": [], - "type": "mean" + "params": [ + "99" + ], + "type": "percentile" } ] ], @@ -427,6 +429,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ] }, @@ -486,11 +494,17 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ] }, { - "alias": "99", + "alias": "mean", "dsType": "influxdb", "groupBy": [ { @@ -510,7 +524,7 @@ "policy": "default", "query": "SELECT \"value\" FROM \"http response content_length_bytes\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"application\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($timeInterval)", "rawQuery": false, - "refId": "C", + "refId": "A", "resultFormat": "time_series", "select": [ [ @@ -521,10 +535,8 @@ "type": "field" }, { - "params": [ - "99" - ], - "type": "percentile" + "params": [], + "type": "mean" } ] ], @@ -545,6 +557,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ] } @@ -680,6 +698,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ] } @@ -810,6 +834,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + 
"operator": "=", + "value": "server" } ] } @@ -868,11 +898,11 @@ "renderer": "flot", "seriesOverrides": [], "span": 12, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "alias": "25", + "alias": "99", "dsType": "influxdb", "groupBy": [ { @@ -880,19 +910,75 @@ "$summarize" ], "type": "time" + } + ], + "hide": false, + "measurement": "http response time_ms", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($interval), \"method\" fill(null)", + "rawQuery": false, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + "99" + ], + "type": "percentile" + } + ] + ], + "tags": [ + { + "key": "org", + "operator": "=~", + "value": "/^$org_name$/" + }, + { + "condition": "AND", + "key": "space", + "operator": "=~", + "value": "/^$space_name$/" + }, + { + "condition": "AND", + "key": "application", + "operator": "=~", + "value": "/^$app_name$/" }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" + } + ], + "target": "" + }, + { + "alias": "95", + "dsType": "influxdb", + "groupBy": [ { "params": [ - "null" + "$summarize" ], - "type": "fill" + "type": "time" } ], + "hide": false, "measurement": "http response time_ms", "policy": "default", "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($interval), \"method\" fill(null)", "rawQuery": false, - "refId": "D", + "refId": "B", "resultFormat": "time_series", "select": [ [ @@ -904,7 +990,7 @@ }, { "params": [ - "25" + 95 ], "type": "percentile" } @@ -927,6 +1013,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ], "target": "" @@ -940,14 +1032,9 @@ "$summarize" ], 
"type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" } ], + "hide": false, "measurement": "http response time_ms", "policy": "default", "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($interval), \"method\" fill(null)", @@ -985,12 +1072,18 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ], "target": "" }, { - "alias": "95", + "alias": "25", "dsType": "influxdb", "groupBy": [ { @@ -998,12 +1091,6 @@ "$summarize" ], "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" } ], "hide": false, @@ -1011,7 +1098,7 @@ "policy": "default", "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($interval), \"method\" fill(null)", "rawQuery": false, - "refId": "B", + "refId": "D", "resultFormat": "time_series", "select": [ [ @@ -1023,7 +1110,7 @@ }, { "params": [ - 95 + "25" ], "type": "percentile" } @@ -1046,12 +1133,99 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ], "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Response time (router)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "alias": "99", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "showTitle": false, + "titleSize": "h6", + "height": 250, + "repeat": null, + "repeatRowId": null, + 
"repeatIteration": null, + "collapse": false + }, + { + "title": "Dashboard Row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "InfluxDB", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "alias": "application mean", "dsType": "influxdb", "groupBy": [ { @@ -1059,20 +1233,71 @@ "$summarize" ], "type": "time" + } + ], + "measurement": "http response time_ms", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter GROUP BY time($interval), \"method\" fill(null)", + "rawQuery": false, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "org", + "operator": "=~", + "value": "/^$org_name$/" + }, + { + "condition": "AND", + "key": "space", + "operator": "=~", + "value": "/^$space_name$/" + }, + { + "condition": "AND", + "key": "application", + "operator": "=~", + "value": "/^$app_name$/" }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "client" + } + ], + "target": "" + }, + { + "alias": "router mean", + "dsType": "influxdb", + "groupBy": [ { "params": [ - "null" + "$summarize" ], - "type": "fill" + "type": "time" } ], - "hide": false, "measurement": "http response time_ms", "policy": "default", "query": "SELECT mean(\"value\") FROM \"http response time_ms\" WHERE \"state\" = 'ok' AND \"host\" =~ /^$app_name$/ AND $timeFilter 
GROUP BY time($interval), \"method\" fill(null)", "rawQuery": false, - "refId": "C", + "refId": "B", "resultFormat": "time_series", "select": [ [ @@ -1083,10 +1308,8 @@ "type": "field" }, { - "params": [ - "99" - ], - "type": "percentile" + "params": [], + "type": "mean" } ] ], @@ -1107,6 +1330,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ], "target": "" @@ -1115,7 +1344,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "HTTP Response times", + "title": "HTTP Response time (app vs router)", "tooltip": { "msResolution": true, "shared": true, @@ -1133,9 +1362,9 @@ { "format": "ms", "label": "", - "logBase": 1, + "logBase": 10, "max": null, - "min": "0", + "min": 0, "show": true }, { @@ -1251,6 +1480,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + "operator": "=", + "value": "server" } ], "target": "" @@ -1259,7 +1494,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "HTTP Response time (detailed)", + "title": "HTTP Response time (router)", "tooltip": { "msResolution": true, "shared": true, diff --git a/demo/mib/dashboards/overview.json b/demo/mib/dashboards/overview.json index b2fa5af..3789bac 100644 --- a/demo/mib/dashboards/overview.json +++ b/demo/mib/dashboards/overview.json @@ -239,7 +239,7 @@ }, "refresh": "30s", "schemaVersion": 13, - "version": 22, + "version": 25, "links": [], "gnetId": null, "rows": [ @@ -1169,7 +1169,7 @@ ], "measurement": "http response time_ms", "policy": "default", - "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"host\" =~ /^$app_name$/ AND \"state\" = 'ok' AND $timeFilter", + "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"application\" =~ /^$app_name$/ AND \"state\" = 'ok' AND \"peer\" = 'server' 
AND $timeFilter", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -1286,7 +1286,7 @@ "hide": false, "measurement": "http response time_ms", "policy": "default", - "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"host\" =~ /^$app_name$/ AND \"state\" = 'ok' AND \"status_code\" =~ /^2[0-9]+$/ AND $timeFilter", + "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"host\" =~ /^$app_name$/ AND \"peer\" = 'server' AND \"state\" = 'ok' AND \"status_code\" =~ /^2[0-9]+$/ AND $timeFilter", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -1408,7 +1408,7 @@ "hide": false, "measurement": "http response time_ms", "policy": "default", - "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"host\" =~ /^$app_name$/ AND \"state\" = 'ok' AND \"status_code\" =~ /^5[0-9]+$/ AND $timeFilter", + "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"host\" =~ /^$app_name$/ AND \"peer\" = 'server' AND \"state\" = 'ok' AND \"status_code\" =~ /^5[0-9]+$/ AND $timeFilter", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -1530,7 +1530,7 @@ "hide": false, "measurement": "http response time_ms", "policy": "default", - "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"host\" =~ /^$app_name$/ AND \"state\" = 'ok' AND \"status_code\" =~ /^4[0-9]+$/ AND $timeFilter", + "query": "SELECT count(\"value\") FROM \"http response time_ms\" WHERE \"org\" =~ /^$org_name$/ AND \"space\" =~ /^$space_name$/ AND \"host\" =~ /^$app_name$/ AND \"peer\" = 'server' AND \"state\" = 'ok' AND \"status_code\" =~ /^4[0-9]+$/ AND $timeFilter", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -1869,6 +1869,12 @@ "key": "application", "operator": "=~", "value": "/^$app_name$/" + }, + { + "condition": "AND", + "key": "peer", + 
"operator": "=", + "value": "server" } ], "target": "" diff --git a/doc.go b/doc.go index c9ef7c2..30e6229 100644 --- a/doc.go +++ b/doc.go @@ -26,7 +26,11 @@ // org, space, name, id, and the insntace index (when appropriate). // // Additionally, the HTTP events have attributes specifying the method, -// request_id, content length and the returned status code. +// request_id, content length, the returned status code and the peer type. +// There are two peer types - client and server. Client means that measurements +// are recorded via the Cloud Foundry router's HTTP client that requested the +// application container, and server means that the measurements are recorded +// for responding to the end user via the router server. // // The application event metrics have attributes that describe the event's // actor and actee, as well as their ids. diff --git a/httpmetrics.go b/httpmetrics.go index 62f5f34..325ef61 100644 --- a/httpmetrics.go +++ b/httpmetrics.go @@ -20,20 +20,25 @@ func (r httpMetrics) EmitTo(e Emitter) { switch r.GetPeerType() { case cfevent.PeerType_Client: - durationMillis := (r.GetStopTimestamp() - r.GetStartTimestamp()) / 1000000 - e.Emit(forApp(r.App, Metric{ - Service: "http response time_ms", - Metric: int(durationMillis), - State: "ok", - Attributes: attributes, - })) + attributes["peer"] = "client" case cfevent.PeerType_Server: - e.Emit(forApp(r.App, Metric{ - Service: "http response content_length_bytes", - Metric: int(r.GetContentLength()), - State: "ok", - Attributes: attributes, - })) + attributes["peer"] = "server" + default: + attributes["peer"] = "unknown" } + durationMillis := (r.GetStopTimestamp() - r.GetStartTimestamp()) / 1000000 + e.Emit(forApp(r.App, Metric{ + Service: "http response time_ms", + Metric: int(durationMillis), + State: "ok", + Attributes: attributes, + })) + e.Emit(forApp(r.App, Metric{ + Service: "http response content_length_bytes", + Metric: int(r.GetContentLength()), + State: "ok", + Attributes: attributes, + })) 
+ }