Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow disabling network and/or storage metric collection only #4420

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ additional details on each available environment variable.
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | false |
| `ECS_DISABLE_STORAGE_METRICS` | <true | false> | Whether to disable storage metrics gathering for tasks. | false | false |
| `ECS_DISABLE_NETWORK_METRICS` | <true | false> | Whether to disable network metrics gathering for tasks. | false | false |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. Setting this value to `true` can help reduce the CPU usage of dockerd and containerd on the ECS container instance. See also ECS_POLL_METRICS_WAIT_DURATION for setting the poll interval. | `false` | `false` |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. If user sets above maximum it will be set to max, and if below minimum it will be set to min. As the number of tasks/containers increase, a higher `ECS_POLLING_METRICS_WAIT_DURATION` value can potentially cause a problem where memory reservation value of ECS cluster reported in metrics becomes unstable due to missing metrics sample at metric collection time. It is recommended to keep this value smaller than 18s. This behavior is only observed on certain OS and platforms. | 10s | 10s |
| `ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT` | <true | false> | Whether to pull images for containers with dependencies before the dependsOn condition has been satisfied. | false | false |
Expand Down
6 changes: 6 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,8 @@ func environmentConfig() (Config, error) {
UpdatesEnabled: parseBooleanDefaultFalseConfig("ECS_UPDATES_ENABLED"),
UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"),
DisableMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_METRICS"),
DisableNetworkMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_NETWORK_METRICS"),
DisableStorageMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_STORAGE_METRICS"),
ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"),
AvailableLoggingDrivers: parseAvailableLoggingDrivers(),
PrivilegedDisabled: parseBooleanDefaultFalseConfig("ECS_DISABLE_PRIVILEGED"),
Expand Down Expand Up @@ -626,6 +628,8 @@ func (cfg *Config) String() string {
"AuthType: %v, "+
"UpdatesEnabled: %v, "+
"DisableMetrics: %v, "+
"DisableNetworkMetrics: %v, "+
"DisableStorageMetrics: %v, "+
"PollMetrics: %v, "+
"PollingMetricsWaitDuration: %v, "+
"ReservedMem: %v, "+
Expand All @@ -646,6 +650,8 @@ func (cfg *Config) String() string {
cfg.EngineAuthType,
cfg.UpdatesEnabled,
cfg.DisableMetrics,
cfg.DisableNetworkMetrics,
cfg.DisableStorageMetrics,
cfg.PollMetrics,
cfg.PollingMetricsWaitDuration,
cfg.ReservedMemory,
Expand Down
4 changes: 4 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,14 @@ func TestConfigBoolean(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_DISABLE_DOCKER_HEALTH_CHECK", "true")()
defer setTestEnv("ECS_DISABLE_METRICS", "true")()
defer setTestEnv("ECS_DISABLE_NETWORK_METRICS", "true")()
defer setTestEnv("ECS_DISABLE_STORAGE_METRICS", "true")()
defer setTestEnv("ECS_ENABLE_SPOT_INSTANCE_DRAINING", "true")()
cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.True(t, cfg.DisableMetrics.Enabled())
assert.True(t, cfg.DisableNetworkMetrics.Enabled())
assert.True(t, cfg.DisableStorageMetrics.Enabled())
assert.True(t, cfg.DisableDockerHealthCheck.Enabled())
assert.True(t, cfg.SpotInstanceDrainingEnabled.Enabled())
}
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ func DefaultConfig() Config {
DataDir: "/data/",
DataDirOnHost: "/var/lib/ecs",
DisableMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
DisableNetworkMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
DisableStorageMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver},
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, "unix:///var/run/docker.sock", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
assert.Equal(t, "/data/", cfg.DataDir, "Default datadir set incorrectly")
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
assert.Equal(t, 5, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, "npipe:////./pipe/docker_engine", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
assert.Equal(t, `C:\ProgramData\Amazon\ECS\data`, cfg.DataDir, "Default datadir set incorrectly")
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
assert.Equal(t, 11, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")
Expand Down
8 changes: 8 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,14 @@ type Config struct {
// sent to the ECS telemetry endpoint
DisableMetrics BooleanDefaultFalse

// DisableNetworkMetrics configures whether task network IO utilization metrics should be
// sent to the ECS telemetry endpoint
DisableNetworkMetrics BooleanDefaultFalse

// DisableStorageMetrics configures whether task block IO utilization metrics should be
// sent to the ECS telemetry endpoint
DisableStorageMetrics BooleanDefaultFalse

// PollMetrics configures whether metrics are constantly streamed for each container or
// polled on interval instead.
PollMetrics BooleanDefaultFalse
Expand Down
102 changes: 53 additions & 49 deletions agent/stats/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -864,14 +864,16 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
MemoryStatsSet: memoryStatsSet,
}

storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting storage stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.StorageStatsSet = storageStatsSet
if !engine.config.DisableStorageMetrics.Enabled() {
storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting storage stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.StorageStatsSet = storageStatsSet
}
}

restartStatsSet, err := container.statsQueue.GetRestartStatsSet()
Expand All @@ -887,51 +889,53 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
containerMetric.RestartStatsSet = restartStatsSet
}

task, err := engine.resolver.ResolveTask(dockerID)
if err != nil {
logger.Warn("Task not found for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
logger.Warn("Could not map container ID to container, container", logger.Fields{
field.DockerId: dockerID,
field.Error: err,
if !engine.config.DisableNetworkMetrics.Enabled() {
task, err := engine.resolver.ResolveTask(dockerID)
if err != nil {
logger.Warn("Task not found for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
// send network stats for default/bridge/nat/awsvpc network modes
if task.IsNetworkModeBridge() {
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
} else {
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
// we log the error and still continue to publish cpu, memory stats
logger.Warn("Error getting network stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
logger.Warn("Could not map container ID to container, container", logger.Fields{
field.DockerId: dockerID,
field.Error: err,
})
} else {
// send network stats for default/bridge/nat/awsvpc network modes
if task.IsNetworkModeBridge() {
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
} else {
containerMetric.NetworkStatsSet = networkStatsSet
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
// we log the error and still continue to publish cpu, memory stats
logger.Warn("Error getting network stats for container", logger.Fields{
field.Container: dockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStatsSet
}
}
}
} else if task.IsNetworkModeAWSVPC() {
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
if !taskExistsInTaskStats {
return nil, fmt.Errorf("task not found")
}
// do not add network stats for pause container
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting network stats for container", logger.Fields{
field.TaskARN: taskArn,
field.Container: dockerContainer.DockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStats
} else if task.IsNetworkModeAWSVPC() {
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
if !taskExistsInTaskStats {
return nil, fmt.Errorf("task not found")
}
// do not add network stats for pause container
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
if err != nil && age > gracePeriod {
logger.Warn("Error getting network stats for container", logger.Fields{
field.TaskARN: taskArn,
field.Container: dockerContainer.DockerID,
field.Error: err,
})
} else {
containerMetric.NetworkStatsSet = networkStats
}
}
}
}
Expand Down
31 changes: 22 additions & 9 deletions agent/stats/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,20 +724,22 @@ func TestSynchronizeOnRestart(t *testing.T) {

func TestTaskNetworkStatsSet(t *testing.T) {
var networkModes = []struct {
ENIs []*ni.NetworkInterface
NetworkMode string
ServiceConnectEnabled bool
StatsEmpty bool
ENIs []*ni.NetworkInterface
NetworkMode string
ServiceConnectEnabled bool
NetworkMetricsDisabled bool
StatsEmpty bool
}{
{nil, DefaultNetworkMode, false, false},
{nil, DefaultNetworkMode, true, true},
{nil, DefaultNetworkMode, false, false, false},
{nil, DefaultNetworkMode, true, false, true},
{nil, DefaultNetworkMode, false, true, true},
}
for _, tc := range networkModes {
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.StatsEmpty)
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.NetworkMetricsDisabled, tc.StatsEmpty)
}
}

func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, emptyStats bool) {
func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, networkMetricsDisabled, emptyStats bool) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
resolver := mock_resolver.NewMockContainerMetadataResolver(mockCtrl)
Expand Down Expand Up @@ -782,7 +784,18 @@ func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterf
State: &types.ContainerState{Pid: 23},
},
}, nil).AnyTimes()
engine := NewDockerStatsEngine(&cfg, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)

var c *config.Config
if networkMetricsDisabled {
c = &config.Config{
DisableNetworkMetrics: config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled},
}
c = c.Merge(cfg)
} else {
c = &cfg
}

engine := NewDockerStatsEngine(c, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
engine.ctx = ctx
Expand Down
18 changes: 10 additions & 8 deletions agent/stats/engine_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,19 @@ const (

func TestLinuxTaskNetworkStatsSet(t *testing.T) {
var networkModes = []struct {
ENIs []*ni.NetworkInterface
NetworkMode string
StatsEmpty bool
ENIs []*ni.NetworkInterface
NetworkMode string
NetworkMetricsDisabled bool
StatsEmpty bool
}{
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", true},
{nil, "host", true},
{nil, "bridge", false},
{nil, "none", true},
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", false, true},
{nil, "host", false, true},
{nil, "bridge", false, false},
{nil, "bridge", true, true},
{nil, "none", false, true},
}
for _, tc := range networkModes {
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.StatsEmpty)
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.NetworkMetricsDisabled, tc.StatsEmpty)
}
}

Expand Down