graphTransform.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. import {
  2. DataFrame,
  3. DataFrameView,
  4. DataQueryResponse,
  5. FieldColorModeId,
  6. FieldDTO,
  7. MutableDataFrame,
  8. NodeGraphDataFrameFieldNames as Fields,
  9. TimeRange,
  10. } from '@grafana/data';
  11. import { getNonOverlappingDuration, getStats, makeFrames, makeSpanMap } from '../../../core/utils/tracing';
  /**
   * One row of a trace data frame. Each row describes a single span.
   */
  interface Row {
    // Identity of the span and its position in the trace.
    traceID: string;
    spanID: string;
    /** Falsy (e.g. empty) when the span has no parent. */
    parentSpanID: string;

    // Naming.
    serviceName: string;
    operationName: string;

    // Timing: `startTime + duration` is used as the end time of the span.
    startTime: number;
    duration: number;

    // Additional string payloads carried by the row; not interpreted in this file.
    serviceTags: string;
    logs: string;
    tags: string;
  }
  /**
   * A node of the trace graph, keyed by the node graph field names.
   * One node is produced per span.
   */
  interface Node {
    /** Span id. */
    [Fields.id]: string;
    /** Service name of the span (empty string when missing). */
    [Fields.title]: string;
    /** Operation name of the span. */
    [Fields.subTitle]: string;
    /** `main` text returned by getStats for the span. */
    [Fields.mainStat]: string;
    /** `secondary` text returned by getStats for the span. */
    [Fields.secondaryStat]: string;
    /** Self duration of the span divided by the duration of the whole trace. */
    [Fields.color]: number;
  }
  /**
   * An edge of the trace graph connecting a parent span (source) to one of its
   * child spans (target).
   */
  interface Edge {
    /** Built as `<parentSpanID>--<spanID>`. */
    [Fields.id]: string;
    /** Id of the parent span. */
    [Fields.source]: string;
    /** Id of the child span. */
    [Fields.target]: string;
  }
  /**
   * Builds the node graph frames for a single trace.
   *
   * @param data Trace data frame with one row per span.
   * @returns A two element array: the nodes frame followed by the edges frame.
   */
  export function createGraphFrames(data: DataFrame): DataFrame[] {
    const graph = convertTraceToGraph(data);
    const [nodesFrame, edgesFrame] = makeFrames();

    // Nodes are added before edges, each in the order they were produced.
    graph.nodes.forEach((node) => {
      nodesFrame.add(node);
    });
    graph.edges.forEach((edge) => {
      edgesFrame.add(edge);
    });

    return [nodesFrame, edgesFrame];
  }
  /**
   * Turns the rows of a trace data frame into graph nodes and edges.
   *
   * Every span becomes a node. An edge from the parent to the span is added only
   * when the parent span is actually present in the data.
   *
   * @param data Trace data frame with one row per span.
   * @returns The nodes and edges, in row order.
   */
  function convertTraceToGraph(data: DataFrame): { nodes: Node[]; edges: Edge[] } {
    const view = new DataFrameView<Row>(data);
    const traceDuration = findTraceDuration(view);

    const spanMap = makeSpanMap((index) => {
      if (index < data.length) {
        const span = view.get(index);
        return {
          // Copy the row into a plain object.
          // NOTE(review): presumably needed because the view reuses its row object — confirm.
          span: { ...span },
          id: span.spanID,
          parentIds: span.parentSpanID ? [span.parentSpanID] : [],
        };
      }
      // Past the last row: nothing more to provide.
      return undefined;
    });

    const nodes: Node[] = [];
    const edges: Edge[] = [];

    for (let rowIndex = 0; rowIndex < view.length; rowIndex++) {
      const row = view.get(rowIndex);
      const spanId = row.spanID;
      const parentId = row.parentSpanID;

      // Time ranges [start, end] of the direct children of this span.
      const childRanges: Array<[number, number]> = [];
      for (const childId of spanMap[spanId].children) {
        const child = spanMap[childId].span;
        childRanges.push([child.startTime, child.startTime + child.duration]);
      }

      // Self duration is what remains of the span once the children's time is removed.
      const selfDuration = row.duration - getNonOverlappingDuration(childRanges);
      const stats = getStats(row.duration, traceDuration, selfDuration);

      nodes.push({
        [Fields.id]: spanId,
        [Fields.title]: row.serviceName ?? '',
        [Fields.subTitle]: row.operationName,
        [Fields.mainStat]: stats.main,
        [Fields.secondaryStat]: stats.secondary,
        [Fields.color]: selfDuration / traceDuration,
      });

      // A parent span can be missing from the data; such spans get no incoming edge.
      if (parentId && spanMap[parentId].span) {
        edges.push({
          [Fields.id]: `${parentId}--${spanId}`,
          [Fields.target]: spanId,
          [Fields.source]: parentId,
        });
      }
    }

    return { nodes, edges };
  }
  /**
   * Get the duration of the whole trace, as it isn't a part of the response data.
   * It is computed as the latest span end (`startTime + duration`) minus the earliest span start.
   * Note: Seems like this should be the same as just the longest span, but this is probably safer.
   *
   * @param view View over the trace rows, one row per span.
   * @returns The trace duration in the same unit as the rows' times, or 0 when the view has no rows.
   */
  function findTraceDuration(view: DataFrameView<Row>): number {
    // Without this guard an empty view would yield 0 - Infinity = -Infinity.
    if (view.length === 0) {
      return 0;
    }

    let traceStartTime = Infinity;
    let traceEndTime = 0;

    for (let i = 0; i < view.length; i++) {
      const row = view.get(i);
      const spanEndTime = row.startTime + row.duration;

      if (row.startTime < traceStartTime) {
        traceStartTime = row.startTime;
      }
      if (spanEndTime > traceEndTime) {
        traceEndTime = spanEndTime;
      }
    }

    return traceEndTime - traceStartTime;
  }
  // Names of the prometheus metrics the service map is built from.
  export const secondsMetric = 'traces_service_graph_request_server_seconds_sum';
  export const totalsMetric = 'traces_service_graph_request_total';
  export const failedMetric = 'traces_service_graph_request_failed_total';
  export const histogramMetric = 'traces_service_graph_request_server_seconds_bucket';

  /**
   * All metric names relevant for the service map.
   *
   * Not included, as they are used for debugging the tempo collection and are probably not
   * useful for the service map right now:
   *  - 'traces_service_graph_unpaired_spans_total'
   *  - 'traces_service_graph_untagged_spans_total'
   */
  export const serviceMapMetrics: string[] = [secondsMetric, totalsMetric, failedMetric, histogramMetric];
  /**
   * Map the responses of multiple prometheus metric queries into node graph data frames.
   *
   * The total, seconds and failed metrics are first collected into intermediate maps of nodes
   * and edges, which are then converted to data frames.
   *
   * @param responses Query responses whose frames carry the metric name as ref id.
   * @param range Time range of the query; used to compute per second rates.
   * @returns The nodes and edges data frames.
   */
  export function mapPromMetricsToServiceMap(
    responses: DataQueryResponse[],
    range: TimeRange
  ): { nodes: DataFrame; edges: DataFrame } {
    const frames = getMetricFrames(responses);

    // Intermediate maps keyed by node id and edge id.
    const nodesMap: Record<string, ServiceMapStatistics> = {};
    const edgesMap: Record<string, EdgeObject> = {};

    // Statistic name paired with the metric it is read from, in collection order.
    const statSources: Array<[keyof ServiceMapStatistics, string]> = [
      ['total', totalsMetric],
      ['seconds', secondsMetric],
      ['failed', failedMetric],
    ];
    for (const [stat, metric] of statSources) {
      collectMetricData(frames[metric], stat, metric, nodesMap, edgesMap);
    }

    return convertToDataFrames(nodesMap, edgesMap, range);
  }
  /**
   * Creates the empty nodes and edges data frames of the service map, both marked with the
   * 'nodeGraph' preferred visualisation type.
   *
   * @returns A two element array: the nodes frame followed by the edges frame.
   */
  function createServiceMapDataFrames() {
    const createDF = (name: string, fields: FieldDTO[]) =>
      new MutableDataFrame({ name, fields, meta: { preferredVisualisationType: 'nodeGraph' } });

    const nodeFields: FieldDTO[] = [
      { name: Fields.id },
      { name: Fields.title, config: { displayName: 'Service name' } },
      { name: Fields.mainStat, config: { unit: 'ms/r', displayName: 'Average response time' } },
      { name: Fields.secondaryStat, config: { unit: 'r/sec', displayName: 'Requests per second' } },
      // Arc fields, each with its own fixed color.
      {
        name: Fields.arc + 'success',
        config: { displayName: 'Success', color: { fixedColor: 'green', mode: FieldColorModeId.Fixed } },
      },
      {
        name: Fields.arc + 'failed',
        config: { displayName: 'Failed', color: { fixedColor: 'red', mode: FieldColorModeId.Fixed } },
      },
    ];

    const edgeFields: FieldDTO[] = [
      { name: Fields.id },
      { name: Fields.source },
      { name: Fields.target },
      { name: Fields.mainStat, config: { unit: 'r', displayName: 'Requests' } },
      { name: Fields.secondaryStat, config: { unit: 'ms/r', displayName: 'Average response time' } },
    ];

    const nodes = createDF('Nodes', nodeFields);
    const edges = createDF('Edges', edgeFields);
    return [nodes, edges];
  }
  /**
   * Group the frames of the first response by their ref id, which is set to the metric name so
   * we know which metric is where. Each frame is wrapped in a DataFrameView so it's easier to
   * work with.
   *
   * @param responses Query responses; only the first one is read.
   * @returns Map from ref id to a view over the frame with that ref id.
   */
  function getMetricFrames(responses: DataQueryResponse[]): Record<string, DataFrameView> {
    const viewsByRefId: Record<string, DataFrameView> = {};
    responses[0].data.forEach((frame) => {
      // A later frame with the same ref id replaces an earlier one.
      viewsByRefId[frame.refId] = new DataFrameView(frame);
    });
    return viewsByRefId;
  }
  /**
   * Statistics accumulated for a node or an edge of the service map. Each one is optional as
   * it is only set once the corresponding metric has been collected.
   */
  type ServiceMapStatistics = {
    /** Accumulated value of the totals metric. */
    total?: number;
    /** Accumulated value of the seconds metric. */
    seconds?: number;
    /** Accumulated value of the failed metric. */
    failed?: number;
  };
  /**
   * Statistics of an edge together with the ids of the two nodes it connects.
   */
  type EdgeObject = ServiceMapStatistics & {
    /** Id of the client node. */
    source: string;
    /** Id of the server node. */
    target: string;
  };
  /**
   * Collect the data of one metric into the maps of nodes and edges.
   *
   * Each row of the metric is a count for one client-server pair, i.e. one edge. The value is
   * added to that edge and also to the server node. Only the server node is credited because a
   * node shows the requests it processed, not those it generated; other stats like the average
   * response time then stem from that. The client node is created if needed, with the stat at 0.
   *
   * @param frame View over the metric frame; nothing is done when it is missing.
   * @param stat Name of the statistic the values are accumulated under.
   * @param metric Name of the metric, used to derive the name of the value column.
   * @param nodesMap Map of node id to statistics; mutated in place.
   * @param edgesMap Map of edge id to edge; mutated in place.
   */
  function collectMetricData(
    frame: DataFrameView | undefined,
    stat: keyof ServiceMapStatistics,
    metric: string,
    nodesMap: Record<string, ServiceMapStatistics>,
    edgesMap: Record<string, EdgeObject>
  ) {
    if (!frame) {
      return;
    }

    // The value column is named after the metric in this format.
    // TODO figure out if it can be changed
    const valueColumn = `Value #${metric}`;

    for (let rowIndex = 0; rowIndex < frame.length; rowIndex++) {
      const row = frame.get(rowIndex);
      const client = row.client;
      const server = row.server;
      const value = row[valueColumn];
      const edgeKey = `${client}_${server}`;

      const knownEdge = edgesMap[edgeKey];
      if (knownEdge) {
        // There should be a single row per edge, but sum the values up if there are more.
        knownEdge[stat] = (knownEdge[stat] || 0) + value;
      } else {
        edgesMap[edgeKey] = {
          target: server,
          source: client,
          [stat]: value,
        };
      }

      const serverNode = nodesMap[server];
      if (serverNode) {
        // Several edges can target the same server node, so the values are summed up.
        serverNode[stat] = (serverNode[stat] || 0) + value;
      } else {
        nodesMap[server] = {
          [stat]: value,
        };
      }

      // Checked after the server node so that a row whose client is also its server is not reset.
      if (!nodesMap[client]) {
        nodesMap[client] = {
          [stat]: 0,
        };
      }
    }
  }
  /**
   * Converts the collected node and edge statistics into the service map data frames.
   *
   * @param nodesMap Map of node id to its accumulated statistics.
   * @param edgesMap Map of edge id to the edge with its accumulated statistics.
   * @param range Time range of the query; its length is used for the requests per second.
   * @returns The filled nodes and edges data frames.
   */
  function convertToDataFrames(
    nodesMap: Record<string, ServiceMapStatistics>,
    edgesMap: Record<string, EdgeObject>,
    range: TimeRange
  ): { nodes: DataFrame; edges: DataFrame } {
    const rangeMs = range.to.valueOf() - range.from.valueOf();
    const [nodes, edges] = createServiceMapDataFrames();

    for (const [nodeId, node] of Object.entries(nodesMap)) {
      // Values used when the node has no requests. NaN will not be shown in the node graph.
      // This happens for a root client node which did not process any requests itself.
      let averageResponseTime = Number.NaN;
      let requestsPerSecond = Number.NaN;
      let successRatio = 1;
      let failedRatio = 0;

      if (node.total) {
        // The failed count is capped so that the ratios stay within 0 and 1.
        const failed = Math.min(node.failed || 0, node.total);
        averageResponseTime = (node.seconds! / node.total) * 1000;
        // Rounded to 2 decimals.
        requestsPerSecond = Math.round((node.total / (rangeMs / 1000)) * 100) / 100;
        successRatio = (node.total - failed) / node.total;
        failedRatio = failed / node.total;
      }

      nodes.add({
        [Fields.id]: nodeId,
        [Fields.title]: nodeId,
        [Fields.mainStat]: averageResponseTime,
        [Fields.secondaryStat]: requestsPerSecond,
        [Fields.arc + 'success']: successRatio,
        [Fields.arc + 'failed']: failedRatio,
      });
    }

    for (const [edgeId, edge] of Object.entries(edgesMap)) {
      const requests = edge.total;
      const averageResponseTime = requests ? (edge.seconds! / requests) * 1000 : Number.NaN;

      edges.add({
        [Fields.id]: edgeId,
        [Fields.source]: edge.source,
        [Fields.target]: edge.target,
        [Fields.mainStat]: requests,
        [Fields.secondaryStat]: averageResponseTime,
      });
    }

    return { nodes, edges };
  }