diff --git a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json index 628acf1569..90bda3bfff 100644 --- a/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json +++ b/src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-AMD.json @@ -1,7 +1,7 @@ { "Description": "Default Monitors for AMD GPU systems.", "Metadata": { - "SupportedPlatforms": "linux-x64,win-x64", + "SupportedPlatforms": "linux-x64, win-x64", "SupportedOperatingSystems": "CBL-Mariner,CentOS,Debian,RedHat,Suse,Ubuntu,Windows" }, "Parameters": { @@ -12,9 +12,19 @@ { "Type": "AmdSmiMonitor", "Parameters": { - "Scenario": "AmdGpuCounters", - "MonitorFrequency": "$.Parameters.MonitorFrequency", - "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + "Scenario": "AmdGpuCounters", + "Subsystem": "metric", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + } + }, + { + "Type": "AmdSmiMonitor", + "Parameters": { + "Scenario": "AmdGpuCounters", + "Subsystem": "xgmi", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" } }, { diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiMetricQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiMetricQueryGpuParserUnitTests.cs new file mode 100644 index 0000000000..7dbb81bacc --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiMetricQueryGpuParserUnitTests.cs @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +namespace VirtualClient.Monitors +{ + using System.Collections.Generic; + using System.Diagnostics; + using System.IO; + using System.Linq; + using System.Reflection; + using System.Text; + using System.Threading.Tasks; + using NUnit.Framework; + using VirtualClient.Common; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class AmdSmiMetricQueryGpuParserUnitTests + { + [Test] + public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric.csv"); + string rawText = File.ReadAllText(outputPath); + + AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); + IList metrics = testParser.Parse(); + + MetricAssert.Exists(metrics, "utilization.gpu [%]", 98, "%"); + MetricAssert.Exists(metrics, "framebuffer.total [MB]", 14928, "MB"); + MetricAssert.Exists(metrics, "framebuffer.used [MB]", 363, "MB"); + + } + + [Test] + public void AmdSmiMetricQueryGpuParserParsesMetricsCorrectly_MI300X() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "metric-8xMI300X.csv"); + string rawText = File.ReadAllText(outputPath); + + AmdSmiMetricQueryGpuParser testParser = new AmdSmiMetricQueryGpuParser(rawText); + IList metrics = testParser.Parse(); + + MetricAssert.Exists(metrics, "utilization.gpu", 0, "%"); + MetricAssert.Exists(metrics, "utilization.memory", 0, "%"); + MetricAssert.Exists(metrics, "temperature.gpu", 36, "celsius"); + MetricAssert.Exists(metrics, "temperature.memory", 30, "celsius"); + MetricAssert.Exists(metrics, "power.draw.average", 133, "W"); + MetricAssert.Exists(metrics, "gfx_clk_avg", 132.125, "MHz"); + MetricAssert.Exists(metrics, "mem_clk", 900, "MHz"); + MetricAssert.Exists(metrics, "video_vclk_avg", 29, 
"MHz"); + MetricAssert.Exists(metrics, "video_dclk_avg", 22, "MHz"); + MetricAssert.Exists(metrics, "pcie_bw", 24, "MB/s"); + } + } +} \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs deleted file mode 100644 index bba3396138..0000000000 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiQueryGpuParserUnitTests.cs +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace VirtualClient.Monitors -{ - using System.Collections.Generic; - using System.Diagnostics; - using System.IO; - using System.Linq; - using System.Reflection; - using System.Text; - using System.Threading.Tasks; - using NUnit.Framework; - using VirtualClient.Common; - using VirtualClient.Contracts; - - [TestFixture] - [Category("Unit")] - public class AmdSmiQueryGpuParserUnitTests - { - [Test] - public void AmdSmiQueryGpuParserParsesMetricsCorrectly() - { - string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); - string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "result.txt"); - string rawText = File.ReadAllText(outputPath); - - AmdSmiQueryGpuParser testParser = new AmdSmiQueryGpuParser(rawText); - IList metrics = testParser.Parse(); - - Assert.AreEqual(3, metrics.Count); - MetricAssert.Exists(metrics, "utilization.gpu [%]", 98, "%"); - MetricAssert.Exists(metrics, "framebuffer.total [MB]", 14928, "MB"); - MetricAssert.Exists(metrics, "framebuffer.used [MB]", 363, "MB"); - } - } -} \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs new file mode 100644 index 0000000000..984d66413c --- /dev/null +++ 
b/src/VirtualClient/VirtualClient.Monitors.UnitTests/AmdSmiXGMIQueryGpuParserUnitTests.cs @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System.Collections.Generic; + using System.Diagnostics; + using System.IO; + using System.Linq; + using System.Reflection; + using System.Text; + using System.Threading.Tasks; + using NUnit.Framework; + using VirtualClient.Common; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class AmdSmiXGMIQueryGpuParserUnitTests + { + [Test] + public void AmdSmiXGMIQueryGpuParserParsesMetricsCorrectly() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "amd-smi", "xgmi-8xMI300X.json"); + string rawText = File.ReadAllText(outputPath); + + AmdSmiXGMIQueryGpuParser testParser = new AmdSmiXGMIQueryGpuParser(rawText); + IList metrics = testParser.Parse(); + + Assert.AreEqual(8, metrics.Count); + MetricAssert.Exists(metrics, "xgmi_0_data", 14, "KB"); + MetricAssert.Exists(metrics, "xgmi_1_data", 12, "KB"); + MetricAssert.Exists(metrics, "xgmi_2_data", 10, "KB"); + MetricAssert.Exists(metrics, "xgmi_3_data", 9, "KB"); + MetricAssert.Exists(metrics, "xgmi_4_data", 9, "KB"); + MetricAssert.Exists(metrics, "xgmi_5_data", 8, "KB"); + MetricAssert.Exists(metrics, "xgmi_6_data", 6, "KB"); + MetricAssert.Exists(metrics, "xgmi_7_data", 6, "KB"); + } + } +} \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv new file mode 100644 index 0000000000..fd48ca0183 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric-8xMI300X.csv @@ -0,0 +1,9 @@ 
+gpu,gfx_activity,umc_activity,mm_activity,vcn_activity,jpeg_activity,socket_power,gfx_voltage,soc_voltage,mem_voltage,power_management,throttle_status,gfx_0_clk,gfx_0_min_clk,gfx_0_max_clk,gfx_0_clk_locked,gfx_0_deep_sleep,gfx_1_clk,gfx_1_min_clk,gfx_1_max_clk,gfx_1_clk_locked,gfx_1_deep_sleep,gfx_2_clk,gfx_2_min_clk,gfx_2_max_clk,gfx_2_clk_locked,gfx_2_deep_sleep,gfx_3_clk,gfx_3_min_clk,gfx_3_max_clk,gfx_3_clk_locked,gfx_3_deep_sleep,gfx_4_clk,gfx_4_min_clk,gfx_4_max_clk,gfx_4_clk_locked,gfx_4_deep_sleep,gfx_5_clk,gfx_5_min_clk,gfx_5_max_clk,gfx_5_clk_locked,gfx_5_deep_sleep,gfx_6_clk,gfx_6_min_clk,gfx_6_max_clk,gfx_6_clk_locked,gfx_6_deep_sleep,gfx_7_clk,gfx_7_min_clk,gfx_7_max_clk,gfx_7_clk_locked,gfx_7_deep_sleep,mem_0_clk,mem_0_min_clk,mem_0_max_clk,mem_0_clk_locked,mem_0_deep_sleep,vclk_0_clk,vclk_0_min_clk,vclk_0_max_clk,vclk_0_clk_locked,vclk_0_deep_sleep,vclk_1_clk,vclk_1_min_clk,vclk_1_max_clk,vclk_1_clk_locked,vclk_1_deep_sleep,vclk_2_clk,vclk_2_min_clk,vclk_2_max_clk,vclk_2_clk_locked,vclk_2_deep_sleep,vclk_3_clk,vclk_3_min_clk,vclk_3_max_clk,vclk_3_clk_locked,vclk_3_deep_sleep,dclk_0_clk,dclk_0_min_clk,dclk_0_max_clk,dclk_0_clk_locked,dclk_0_deep_sleep,dclk_1_clk,dclk_1_min_clk,dclk_1_max_clk,dclk_1_clk_locked,dclk_1_deep_sleep,dclk_2_clk,dclk_2_min_clk,dclk_2_max_clk,dclk_2_clk_locked,dclk_2_deep_sleep,dclk_3_clk,dclk_3_min_clk,dclk_3_max_clk,dclk_3_clk_locked,dclk_3_deep_sleep,edge,hotspot,mem,width,speed,bandwidth,replay_count,l0_to_recovery_count,replay_roll_over_count,nak_sent_count,nak_received_count,current_bandwidth_sent,current_bandwidth_received,max_packet_size,total_correctable_count,total_uncorrectable_count,total_deferred_count,cache_correctable_count,cache_uncorrectable_count,UMC_correctable_count,UMC_uncorrectable_count,UMC_deferred_count,SDMA_correctable_count,SDMA_uncorrectable_count,SDMA_deferred_count,GFX_correctable_count,GFX_uncorrectable_count,GFX_deferred_count,MMHUB_correctable_count,MMHUB_uncorrectable_count,MMHUB_deferred_coun
t,PCIE_BIF_correctable_count,PCIE_BIF_uncorrectable_count,PCIE_BIF_deferred_count,HDP_correctable_count,HDP_uncorrectable_count,HDP_deferred_count,XGMI_WAFL_correctable_count,XGMI_WAFL_uncorrectable_count,XGMI_WAFL_deferred_count,max,rpm,usage,point_0_frequency,point_0_voltage,point_1_frequency,point_1_voltage,point_2_frequency,point_2_voltage,overdrive,perf_level,xgmi_err,total_energy_consumption,total_vram,used_vram,free_vram,total_visible_vram,used_visible_vram,free_visible_vram,total_gtt,used_gtt,free_gtt +0,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",133,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,30,16,N/A,192,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12119300.381,196592,283,196309,196592,283,196309,1031932,20,1031912 +1,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",139,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,35,29,16,N/A,157,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12648636.191,196592,283,196309,196592,283,196309,1031932,20,1031912 +2,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",133,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,32,29,16,N/A,106,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12142639.892,196592,283,196309,196592,283,196309,1031932,20,1031912 +3,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,28,16,N/A,192,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12086086.983,196592,283,196309,196592,283,196309,1031932,20,1031912 +4,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",136,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,133,500,2100,DISABLED,ENABLED,134,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,145,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12403534.5,196592,283,196309,196592,283,196309,1031932,20,1031912 +5,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,35,29,16,N/A,107,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12011883.234,196592,283,196309,196592,283,196309,1031932,20,1031912 +6,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",132,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,131,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,36,29,16,N/A,94,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,11987029.516,196592,283,196309,196592,283,196309,1031932,20,1031912 +7,0,0,N/A,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0]",134,N/A,N/A,N/A,ENABLED,UNTHROTTLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,132,500,2100,DISABLED,ENABLED,900,900,1300,N/A,DISABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,29,914,1333,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,22,711,1143,N/A,ENABLED,N/A,37,31,16,N/A,90,0,1,0,0,0,N/A,N/A,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,AMDSMI_DEV_PERF_LEVEL_AUTO,N/A,12193331.537,196592,283,196309,196592,283,196309,1031932,20,1031912 \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv new file mode 100644 index 0000000000..bf9329164a --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/metric.csv @@ -0,0 +1,2 @@ +gpu,gfx_usage,mem_usage,mm_usage_list,fb_total,fb_used,gfx_cur_clk,mem_cur_clk,mm1_cur_clk,mm2_cur_clk +0,98,1,[0, 0],14928,363,N/A,N/A,N/A,N/A \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt deleted file mode 100644 index 73551c460e..0000000000 Binary files a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/result.txt and /dev/null differ diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json new file mode 100644 index 0000000000..79ded80a90 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/amd-smi/xgmi-8xMI300X.json @@ -0,0 +1,858 @@ +[ + { + "gpu": 0, + 
"bdf": "0000:0c:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + 
"read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + 
{ + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + 
"value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 1, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + 
"value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": "N/A", + "write": "N/A" + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + } + ] + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "link_metrics": { + "bit_rate": { + "value": 32, + "unit": "Gb/s" + }, + "max_bandwidth": { + "value": 512, + "unit": "Gb/s" + }, + "link_type": "XGMI", + "links": [ + { + "gpu": 0, + "bdf": "0000:0c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 0, + "unit": "KB" + } + }, + { + "gpu": 1, + "bdf": "0000:22:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 2, + "bdf": "0000:38:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 3, + "bdf": "0000:5c:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 4, + "bdf": "0000:9f:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 5, + "bdf": "0000:af:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": { + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 6, + "bdf": "0000:bf:00.0", + "read": { + "value": 0, + "unit": "KB" + }, + "write": 
{ + "value": 1, + "unit": "KB" + } + }, + { + "gpu": 7, + "bdf": "0000:df:00.0", + "read": "N/A", + "write": "N/A" + } + ] + } + } +] \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs new file mode 100644 index 0000000000..7d477f2da9 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMetricQueryGpuParser.cs @@ -0,0 +1,94 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using System; + using System.Collections.Generic; + using System.Data; + using System.Linq; + using System.Text.RegularExpressions; + using VirtualClient.Contracts; + using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions; + + /// + /// Parser for AmdSmi output document. + /// + public class AmdSmiMetricQueryGpuParser : MetricsParser + { + /// + /// Constructor for + /// + /// Raw text to parse. 
+        public AmdSmiMetricQueryGpuParser(string rawText)
+            : base(rawText)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override IList<Metric> Parse()
+        {
+            this.Preprocess();
+
+            List<Metric> metrics = new List<Metric>();
+            DataTable dataTable = DataTableExtensions.DataTableFromCsv(this.PreprocessedText);
+
+            foreach (DataRow row in dataTable.Rows)
+            {
+                Dictionary<string, IConvertible> metadata = new Dictionary<string, IConvertible>()
+                {
+                    { "gpu.id", Convert.ToString(SafeGet(row, "gpu")) }
+                };
+
+                // Every reading below is reported as -1 when its column is missing or not numeric.
+                metrics.Add(new Metric("utilization.gpu [%]", GetDouble(row, "gfx_usage"), unit: "%", metadata: metadata));
+                metrics.Add(new Metric("framebuffer.total [MB]", GetDouble(row, "fb_total"), unit: "MB", metadata: metadata));
+                metrics.Add(new Metric("framebuffer.used [MB]", GetDouble(row, "fb_used"), unit: "MB", metadata: metadata));
+
+                // AMD MI300X
+                metrics.Add(new Metric("utilization.gpu", GetDouble(row, "gfx_activity"), unit: "%", metadata: metadata));
+
+                // Derive the percentage only from real readings. Dividing the two -1 placeholders would
+                // report 100 % and a total of 0 would produce NaN/Infinity.
+                double usedVram = GetDouble(row, "used_vram");
+                double totalVram = GetDouble(row, "total_vram");
+                double memoryUtilization = (usedVram >= 0 && totalVram > 0) ? Math.Round(100 * usedVram / totalVram) : -1;
+                metrics.Add(new Metric("utilization.memory", memoryUtilization, unit: "%", metadata: metadata));
+                metrics.Add(new Metric("temperature.gpu", GetDouble(row, "hotspot"), unit: "celsius", metadata: metadata));
+                metrics.Add(new Metric("temperature.memory", GetDouble(row, "mem"), unit: "celsius", metadata: metadata));
+                metrics.Add(new Metric("power.draw.average", GetDouble(row, "socket_power"), unit: "W", metadata: metadata));
+
+                double gfxClockAverage = Average(
+                    row, "gfx_0_clk", "gfx_1_clk", "gfx_2_clk", "gfx_3_clk", "gfx_4_clk", "gfx_5_clk", "gfx_6_clk", "gfx_7_clk");
+
+                metrics.Add(new Metric("gfx_clk_avg", gfxClockAverage, unit: "MHz", metadata: metadata));
+                metrics.Add(new Metric("mem_clk", GetDouble(row, "mem_0_clk"), unit: "MHz", metadata: metadata));
+
+                double videoVclkAverage = Average(row, "vclk_0_clk", "vclk_1_clk", "vclk_2_clk", "vclk_3_clk");
+
+                metrics.Add(new Metric("video_vclk_avg", videoVclkAverage, unit: "MHz", metadata: metadata));
+
+                double videoDclkAverage = Average(row, "dclk_0_clk", "dclk_1_clk", "dclk_2_clk", "dclk_3_clk");
+
+                metrics.Add(new Metric("video_dclk_avg", videoDclkAverage, unit: "MHz", metadata: metadata));
+
+                // NOTE(review): the division by 8 presumably converts Mb/s to MB/s - confirm against amd-smi.
+                double pcieBandwidth = GetDouble(row, "bandwidth");
+                metrics.Add(new Metric("pcie_bw", pcieBandwidth >= 0 ? pcieBandwidth / 8 : -1, unit: "MB/s", metadata: metadata));
+            }
+
+            return metrics;
+        }
+
+        /// <inheritdoc/>
+        protected override void Preprocess()
+        {
+            this.PreprocessedText = this.RawText.Replace("\r\n", Environment.NewLine);
+
+            // Quoted cells and bracketed lists are collapsed to the single token N/A before the
+            // text is handed to the CSV reader.
+            Regex quotedPattern = new Regex("\"([^\"]*)\"");
+            this.PreprocessedText = quotedPattern.Replace(this.PreprocessedText, "N/A");
+            Regex quotedPattern2 = new Regex("\\[.*?\\]");
+            this.PreprocessedText = quotedPattern2.Replace(this.PreprocessedText, "N/A");
+        }
+
+        /// <summary>
+        /// Reads a column as a number using the invariant culture.
+        /// </summary>
+        /// <param name="row">The row to read from.</param>
+        /// <param name="columnName">The name of the column to read.</param>
+        /// <returns>The parsed value or -1 when the column is missing or its content is not numeric (e.g. N/A).</returns>
+        private static double GetDouble(DataRow row, string columnName)
+        {
+            string text = Convert.ToString(SafeGet(row, columnName), System.Globalization.CultureInfo.InvariantCulture);
+
+            return double.TryParse(text, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out double value)
+                ? value
+                : -1;
+        }
+
+        /// <summary>
+        /// Averages the readings of the given columns, ignoring readings that are unavailable.
+        /// </summary>
+        /// <param name="row">The row to read from.</param>
+        /// <param name="columnNames">The names of the columns to average.</param>
+        /// <returns>The average of the available (non-negative) readings or -1 when none is available.</returns>
+        private static double Average(DataRow row, params string[] columnNames)
+        {
+            double sum = 0;
+            int count = 0;
+
+            foreach (string columnName in columnNames)
+            {
+                double value = GetDouble(row, columnName);
+                if (value >= 0)
+                {
+                    sum += value;
+                    count++;
+                }
+            }
+
+            return count > 0 ? sum / count : -1;
+        }
+
+        /// <summary>
+        /// Returns the text of a column or "-1" when the table does not have that column.
+        /// </summary>
+        private static IConvertible SafeGet(DataRow row, string columnName)
+        {
+            return row.Table.Columns.Contains(columnName) ? Convert.ToString(row[columnName]) : "-1";
+        }
+    }
+}
diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs
index da9ac6666c..5728b2ee5f 100644
--- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs
+++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiMonitor.cs
@@ -5,14 +5,17 @@ namespace VirtualClient.Monitors
 {
     using System;
     using System.Collections.Generic;
+    using System.Diagnostics;
     using System.IO.Abstractions;
     using System.Linq;
     using System.Threading;
     using System.Threading.Tasks;
     using global::VirtualClient;
     using global::VirtualClient.Contracts;
+    using Microsoft.CodeAnalysis;
     using Microsoft.Extensions.DependencyInjection;
     using Microsoft.Extensions.Logging;
+    using Utilities;
     using VirtualClient.Common;
     using VirtualClient.Common.Extensions;
     using VirtualClient.Common.Telemetry;
@@ -22,27 +25,52 @@ namespace VirtualClient.Monitors
     /// </summary>
     public class AmdSmiMonitor : VirtualClientIntervalBasedMonitor
     {
+        /// <summary>
+        /// Name of Metric subsystem.
+        /// </summary>
+        protected const string Metric = "metric";
+
+        /// <summary>
+        /// Name of XGMI subsystem.
+        /// </summary>
+        protected const string XGMI = "xgmi";
+
+        private ISystemManagement systemManagement;
+        private IFileSystem fileSystem;
+
         /// <summary>
         /// Initializes a new instance of the <see cref="AmdSmiMonitor"/> class.
         /// </summary>
         public AmdSmiMonitor(IServiceCollection dependencies, IDictionary<string, IConvertible> parameters)
             : base(dependencies, parameters)
         {
+            this.systemManagement = this.Dependencies.GetService<ISystemManagement>();
+            this.fileSystem = this.systemManagement.FileSystem;
+        }
+
+        /// <summary>
+        /// AMDSMI Subsystem Name.
+        /// </summary>
+        public string Subsystem
+        {
+            get
+            {
+                this.Parameters.TryGetValue(nameof(AmdSmiMonitor.Subsystem), out IConvertible subsystem);
+                return subsystem?.ToString() ?? AmdSmiMonitor.Metric; // profiles without 'Subsystem' keep querying metrics
+            }
         }
 
         /// <inheritdoc/>
         protected override async Task ExecuteAsync(EventContext telemetryContext, CancellationToken cancellationToken)
         {
-            switch (this.Platform)
+            if (string.Equals(this.Subsystem, AmdSmiMonitor.Metric, StringComparison.OrdinalIgnoreCase))
             {
-                case PlatformID.Win32NT:
-                    await this.QueryGpuAsync(telemetryContext, cancellationToken)
-                        .ConfigureAwait(false);
-                    break;
+                await this.QueryGpuMetricAsync(telemetryContext, cancellationToken).ConfigureAwait(false);
+            }
 
-                case PlatformID.Unix:
-                    // not supported at the moment
-                    break;
+            if (string.Equals(this.Subsystem, AmdSmiMonitor.XGMI, StringComparison.OrdinalIgnoreCase))
+            {
+                await this.QueryGpuXGMIAsync(telemetryContext, cancellationToken).ConfigureAwait(false);
             }
         }
 
@@ -58,19 +86,68 @@ protected void ValidateParameters()
             }
         }
 
+        /// <summary>
+        /// Returns the AMD SMI executable name for the current platform: "amdsmi" on Windows,
+        /// "amd-smi" on Unix and an empty string for any other platform.
+        /// </summary>
+        private string GetAmdSmiCommand()
+        {
+            string command = string.Empty;
+            switch (this.Platform)
+            {
+                case PlatformID.Win32NT:
+                    command = "amdsmi";
+                    break;
+
+                case PlatformID.Unix:
+                    command = "amd-smi";
+                    break;
+            }
+
+            return command;
+        }
+
+        /// <summary>
+        /// Pairs two XGMI samples by their "gpu.id" metadata and converts the difference of the
+        /// sampled values into a rate over the elapsed time.
+        /// </summary>
+        /// <param name="metrics1">Metrics parsed from the first sample.</param>
+        /// <param name="metrics2">Metrics parsed from the second sample.</param>
+        /// <param name="time">Elapsed time between the two samples in milliseconds.</param>
+        /// <returns>One "xgmi.bw" metric for each pair of samples that share a GPU id.</returns>
+        private IList<Metric> AmdSmiXGMIBandwidthAggregator(IList<Metric> metrics1, IList<Metric> metrics2, long time)
+        {
+            List<Metric> aggregatedMetrics = new List<Metric>();
+
+            // A non-positive interval cannot yield a rate (it would divide by zero).
+            if (time > 0 && metrics1?.Any() == true && metrics2?.Any() == true)
+            {
+                foreach (Metric counter1 in metrics1)
+                {
+                    foreach (Metric counter2 in metrics2)
+                    {
+                        // Compare by value. '==' compares references when the metadata values are interface/object typed.
+                        if (object.Equals(counter1.Metadata["gpu.id"], counter2.Metadata["gpu.id"]))
+                        {
+                            double bandwidth = (counter2.Value - counter1.Value) / (((double)time) / 1000.0);
+                            aggregatedMetrics.Add(new Metric("xgmi.bw", (bandwidth / 1024), unit: "MB/s", metadata: counter1.Metadata));
+                        }
+                    }
+                }
+            }
+
+            return aggregatedMetrics;
+        }
+
         /// <summary>
         /// Query the gpu for utilization information
         /// </summary>
         /// <param name="telemetryContext">Provides context information that will be captured with telemetry events.</param>
         /// <param name="cancellationToken">A token that can be used to cancel the operation.</param>
/// <returns></returns>
-        private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToken cancellationToken)
+        private async Task QueryGpuMetricAsync(EventContext telemetryContext, CancellationToken cancellationToken)
         {
-            ISystemManagement systemManagement = this.Dependencies.GetService<ISystemManagement>();
-            IFileSystem fileSystem = systemManagement.FileSystem;
-
             int totalSamples = (int)this.MonitorFrequency.TotalSeconds;
-            string command = "amdsmi";
             string commandArguments = "metric --csv";
 
             await Task.Delay(this.MonitorWarmupPeriod, cancellationToken)
@@ -80,7 +143,7 @@ await Task.Delay(this.MonitorWarmupPeriod, cancellationToken)
             {
                 try
                 {
-                    using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, command, $"{commandArguments}", Environment.CurrentDirectory))
+                    using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory))
                     {
                         this.CleanupTasks.Add(() => process.SafeKill());
 
@@ -99,7 +162,7 @@ await process.StartAndWaitAsync(cancellationToken)
 
                             if (process.StandardOutput.Length > 0)
                             {
-                                AmdSmiQueryGpuParser parser = new AmdSmiQueryGpuParser(process.StandardOutput.ToString());
+                                AmdSmiMetricQueryGpuParser parser = new AmdSmiMetricQueryGpuParser(process.StandardOutput.ToString());
                                 IList<Metric> metrics = parser.Parse();
 
                                 if (metrics?.Any() == true)
@@ -128,5 +191,80 @@ await process.StartAndWaitAsync(cancellationToken)
                 }
             }
         }
+
+        /// <summary>
+        /// Samples the XGMI counters twice per iteration and logs the bandwidth derived from the two samples.
+        /// </summary>
+        /// <param name="telemetryContext">Provides context information that will be captured with telemetry events.</param>
+        /// <param name="cancellationToken">A token that can be used to cancel the operation.</param>
+        private async Task QueryGpuXGMIAsync(EventContext telemetryContext, CancellationToken cancellationToken)
+        {
+            string commandArguments = "xgmi -m --json";
+            DateTime startTime1, endTime2;
+            IList<Metric> metrics1, metrics2, aggregatedMetrics;
+
+            Stopwatch stopwatch;
+            long elapsedMilliseconds;
+
+            await Task.Delay(this.MonitorWarmupPeriod, cancellationToken)
+                .ConfigureAwait(false);
+
+            while (!cancellationToken.IsCancellationRequested)
+            {
+                try
+                {
+                    using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory))
+                    {
+                        this.CleanupTasks.Add(() => process.SafeKill());
+
+                        startTime1 = DateTime.UtcNow;
+                        await process.StartAndWaitAsync(cancellationToken)
+                            .ConfigureAwait(false);
+
+                        // Time the interval from the end of the first query to the end of the second so that
+                        // the duration of the first amd-smi run is not counted as time between the samples.
+                        stopwatch = Stopwatch.StartNew();
+
+                        AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString());
+                        metrics1 = parser.Parse();
+                    }
+
+                    // The token is passed to the delays so that a cancellation request is not held up by them.
+                    await Task.Delay(500, cancellationToken).ConfigureAwait(false);
+
+                    using (IProcessProxy process = this.systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, this.GetAmdSmiCommand(), $"{commandArguments}", Environment.CurrentDirectory))
+                    {
+                        this.CleanupTasks.Add(() => process.SafeKill());
+
+                        await process.StartAndWaitAsync(cancellationToken)
+                            .ConfigureAwait(false);
+
+                        endTime2 = DateTime.UtcNow;
+                        stopwatch.Stop();
+                        elapsedMilliseconds = stopwatch.ElapsedMilliseconds;
+
+                        AmdSmiXGMIQueryGpuParser parser = new AmdSmiXGMIQueryGpuParser(process.StandardOutput.ToString());
+                        metrics2 = parser.Parse();
+                    }
+
+                    aggregatedMetrics = this.AmdSmiXGMIBandwidthAggregator(metrics1, metrics2, time: elapsedMilliseconds);
+
+                    if (aggregatedMetrics?.Any() == true)
+                    {
+                        this.Logger.LogPerformanceCounters("amd", aggregatedMetrics, startTime1, endTime2, telemetryContext);
+                    }
+
+                    await Task.Delay(this.MonitorFrequency, cancellationToken).ConfigureAwait(false);
+                }
+                catch (OperationCanceledException)
+                {
+                    // Expected whenever ctrl-C is used.
+                }
+                catch (Exception exc)
+                {
+                    this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning);
+                }
+            }
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs
deleted file mode 100644
index 24220e17b7..0000000000
--- a/src/VirtualClient/VirtualClient.Monitors/AmdSmiQueryGpuParser.cs
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-
-namespace VirtualClient.Monitors
-{
-    using System;
-    using System.Collections.Generic;
-    using System.Data;
-    using System.Linq;
-    using VirtualClient.Contracts;
-    using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions;
-
-    /// <summary>
-    /// Parser for AmdSmi output document.
-    /// </summary>
-    public class AmdSmiQueryGpuParser : MetricsParser
-    {
-        /// <summary>
-        /// Constructor for <see cref="AmdSmiQueryGpuParser"/>
-        /// </summary>
-        /// <param name="rawText">Raw text to parse.</param>
-        public AmdSmiQueryGpuParser(string rawText)
-            : base(rawText)
-        {
-        }
-
-        /// <inheritdoc/>
-        public override IList<Metric> Parse()
-        {
-            this.Preprocess();
-
-            // Sanatize non-standard csv tokens in output
-            string replacedText = this.PreprocessedText.Replace("[0, 0]", "0");
-
-            List<Metric> metrics = new List<Metric>();
-            DataTable dataTable = DataTableExtensions.DataTableFromCsv(replacedText);
-
-            foreach (DataRow row in dataTable.Rows)
-            {
-                Dictionary<string, IConvertible> metadata = new Dictionary<string, IConvertible>()
-                {
-                    { "gpu.id", Convert.ToString(row[0]) },
-                };
-
-                // Ingest only the metrics which are exposed at the guest level
-                metrics.Add(new Metric("utilization.gpu [%]", Convert.ToDouble(row[1]), unit: "%", metadata: metadata));
-                metrics.Add(new Metric("framebuffer.total [MB]", Convert.ToDouble(row[4]), unit: "MB", metadata: metadata));
-                metrics.Add(new Metric("framebuffer.used [MB]", Convert.ToDouble(row[5]), unit: "MB", metadata: metadata));
-            }
-
-            return metrics;
-        }
-
-        /// <inheritdoc/>
-        protected override void Preprocess()
-        {
-            this.PreprocessedText = this.RawText.Replace("\r\n", Environment.NewLine);
-
}
-    }
-}
diff --git a/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs b/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs
new file mode 100644
index 0000000000..74b69065ee
--- /dev/null
+++ b/src/VirtualClient/VirtualClient.Monitors/AmdSmiXGMIQueryGpuParser.cs
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace VirtualClient.Monitors
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Data;
+    using System.Globalization;
+    using System.IO;
+    using System.Linq;
+    using System.Text;
+    using System.Text.RegularExpressions;
+    using System.Threading;
+    using Newtonsoft.Json;
+    using VirtualClient.Contracts;
+    using DataTableExtensions = VirtualClient.Contracts.DataTableExtensions;
+
+    /// <summary>
+    /// Parser for the JSON document produced by the AmdSmi 'xgmi' subcommand.
+    /// </summary>
+    public class AmdSmiXGMIQueryGpuParser : MetricsParser
+    {
+        /// <summary>
+        /// Constructor for <see cref="AmdSmiXGMIQueryGpuParser"/>
+        /// </summary>
+        /// <param name="rawText">Raw text to parse.</param>
+        public AmdSmiXGMIQueryGpuParser(string rawText)
+            : base(rawText)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override IList<Metric> Parse()
+        {
+            this.Preprocess();
+
+            List<Metric> metrics = new List<Metric>();
+            List<dynamic> gpuDataList = JsonConvert.DeserializeObject<List<dynamic>>(this.PreprocessedText);
+
+            // Empty output deserializes to null. Report no metrics instead of failing on it.
+            if (gpuDataList == null)
+            {
+                return metrics;
+            }
+
+            int index = 0;
+            foreach (dynamic gpuData in gpuDataList)
+            {
+                // Sum of the read and write values over all links reported for this GPU.
+                double data = 0;
+                foreach (var link in gpuData.link_metrics.links)
+                {
+                    data += (link.read.value.Value + link.write.value.Value);
+                }
+
+                Dictionary<string, IConvertible> metadata = new Dictionary<string, IConvertible>()
+                {
+                    { "gpu.id", Convert.ToString((object)gpuData.gpu, CultureInfo.InvariantCulture) },
+                };
+
+                metrics.Add(new Metric($"xgmi_{index}_data", data, unit: "KB", metadata: metadata));
+                index++;
+            }
+
+            return metrics;
+        }
+
+        /// <inheritdoc/>
+        protected override void Preprocess()
+        {
+            // Each "N/A" string is replaced with a zero-valued measurement object so that the
+            // read/write members always expose a numeric "value".
+            Regex quotedPattern = new Regex("\"N/A\"");
+            this.PreprocessedText = quotedPattern.Replace(this.RawText, "{\r\n\"value\": 0,\r\n\"unit\": \"KB\"\r\n}");
+        }
+    }
+}
diff --git a/website/docs/monitors/0200-monitor-profiles.md b/website/docs/monitors/0200-monitor-profiles.md
index d3214e1334..cdfd0f87aa 100644
--- a/website/docs/monitors/0200-monitor-profiles.md
+++ b/website/docs/monitors/0200-monitor-profiles.md
@@ -114,3 +114,46 @@ The monitor profile designed for Nvidia GPU systems. The profile captures counte
   ./VirtualClient --profile=PERF-GPU-MLPERF.json --profile=MONITORS-GPU-NVIDIA.json --system=Demo --timeout=1440 --packageStore="{BlobConnectionString|SAS Uri}"
   ```
 
+
+
+## MONITORS-GPU-AMD.json
+The monitor profile designed for AMD GPU systems. The profile captures metrics on systems of AMD GPUs with amd-smi.
+
+* **Supported Platform/Architectures**
+  * linux-x64
+  * win-x64
+
+* **Supported Operating Systems**
+  * Ubuntu 18
+  * Ubuntu 20
+  * Ubuntu 22
+
+* **Dependencies**
+  * The system needs to have an AMD GPU with ROCm installed.
+
+* **Scenarios**
+  * Captures metrics on systems using [amd-smi](./0500-amd-smi.md)
+
+* **Profile Parameters**
+  The following parameters can be optionally supplied on the command line to change this default behavior.
+
+  | Parameter | Purpose | Default value |
+  |---------------------------|---------------------------------------------------------------------------------|---------------|
+  | Scenario | Optional. A description of the purpose of the monitor within the overall profile workflow. | |
+  | MonitorFrequency | Optional. Defines the frequency (timespan) at which performance counters will be captured/emitted (e.g. 00:01:00).
| 00:05:00 | + | MonitorWarmupPeriod | Optional. Defines a period of time (timespan) to wait before starting to track/capture performance counters (e.g. 00:03:00). This allows the system to get to a more typical operational state and generally results in a better representation for the counters captured. | 00:05:00 | + | MetricFilter | Optional. A comma-delimited list of performance counter names to capture. The default behavior is to capture/emit all performance counters (e.g. \Processor Information(_Total)\% System Time,\Processor Information(_Total)\% User Time). This allows the profile author to focus on a smaller/specific subset of the counters. This is typically used when a lower monitor frequency is required for higher sample precision to keep the size of the data sets emitted by the Virtual Client to a minimum. | | + +* **Usage Examples** + The following section provides a few basic examples of how to use the monitor profile. Additional usage examples can be found in the + 'Usage Scenarios/Examples' link at the top. + + ``` bash + # Run the monitoring facilities only. + ./VirtualClient --profile=MONITORS-GPU-AMD.json + + # Monitor profile explicitly defined. + ./VirtualClient --profile=PERF-GPU-3DMARK-AMD.json --profile=MONITORS-GPU-AMD.json --system=Demo --timeout=1440 --packageStore="{BlobConnectionString|SAS Uri}" + + ``` + diff --git a/website/docs/monitors/0500-amd-smi.md b/website/docs/monitors/0500-amd-smi.md new file mode 100644 index 0000000000..3fd9320b57 --- /dev/null +++ b/website/docs/monitors/0500-amd-smi.md @@ -0,0 +1,81 @@ +# AMD SMI +The AMD System Management Interface (SMI) Library, or AMD SMI library, is a C library for Linux that provides a user space interface for applications to monitor and control AMD devices. + +AMD SMI library supports Linux bare metal and Linux virtual machine guests for AMD GPUs. 
+AMD SMI library can run on AMD ROCm supported platforms; refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for more information. + +To run the AMD SMI library, the amdgpu driver and the hsmp driver need to be installed. + +From the command line, it can report information as CSV, JSON or human readable plain text to either standard output or a file. For more details, please refer to the amd-smi documentation. + +* [AMD SMI Documentation](https://rocmdocs.amd.com/projects/amdsmi/en/latest/index.html) +* [AMD SMI CLI](https://rocmdocs.amd.com/projects/amdsmi/en/latest/how-to/using-AMD-SMI-CLI-tool.html) +* [AMD SMI GitHub Repo](https://github.com/ROCm/amdsmi) + +## Dependency +This monitor has a dependency on amd-smi. Please use [AMD Driver Installation] dependency first to make sure amd-smi is present on the system. + +## Supported Platforms +* linux-x64 +* win-x64 + +## Supported Query +The 2 subcommands supported are metric and xgmi. Please create a feature request if you need more subcommands or metrics parsed. + +## amd-smi Output Description +The following section describes the various metrics that are available with amd-smi. 
+ +| Metric Name | Description | +|-------------|-------------| +| utilization.gpu | GPU Utilization in Percentage | +| utilization.memory | GPU Memory Utilization in Percentage | +| temperature.gpu | GPU temperature in Celsius | +| temperature.memory | GPU memory temperature in Celsius | +| power.draw.average | GPU Power Drawn in Watts | +| gfx_clk_avg | Averaged GPU Graphics Clock in MHz | +| mem_clk | GPU Memory Clock in MHz | +| video_vclk_avg | Averaged GPU Video VCLK Clock in MHz | +| video_dclk_avg | Averaged GPU Video DCLK Clock in MHz | +| pcie_bw | Current Bidirectional Bandwidth of PCIe Link of CPU to OAM in MB/s | +| xgmi.bw | Current Total Bidirectional Bandwidth of 7 XGMI Links of OAM in MB/s | +| framebuffer.total | Total Frame Buffer in MB | +| framebuffer.used | Used Frame Buffer in MB | + +### Example +This is an example of the minimum profile to run AmdSmiMonitor. + +Remove the monitor with xgmi subsystem if GPU topology does not include xgmi links. + +```json +{ + "Description": "AMD SMI Monitor for AMD GPU systems.", + "Metadata": { + "SupportedPlatforms": "linux-x64,win-x64", + "SupportedOperatingSystems": "CBL-Mariner,CentOS,Debian,RedHat,Suse,Ubuntu,Windows" + }, + "Parameters": { + "MonitorFrequency": "00:01:00", + "MonitorWarmupPeriod": "00:01:00" + }, + "Monitors": [ + { + "Type": "AmdSmiMonitor", + "Parameters": { + "Scenario": "AmdGpuCounters", + "Subsystem": "metric", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + } + }, + { + "Type": "AmdSmiMonitor", + "Parameters": { + "Scenario": "AmdGpuCounters", + "Subsystem": "xgmi", + "MonitorFrequency": "$.Parameters.MonitorFrequency", + "MonitorWarmupPeriod": "$.Parameters.MonitorWarmupPeriod" + } + } + ] +} +``` \ No newline at end of file