diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ed3a3255765..38d58021a11 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -223,6 +223,16 @@ steps: provider: "gcp" machineType: "n1-standard-8" + - label: "Extended runtime leak tests" + key: "extended-integration-tests" + command: ".buildkite/scripts/steps/integration_tests.sh stateful integration:TestForResourceLeaks" + artifact_paths: + - "build/TEST-**" + - "build/diagnostics/*" + agents: + provider: "gcp" + machineType: "n1-standard-8" + - label: "Integration tests" key: "integration-tests" command: ".buildkite/scripts/steps/integration_tests.sh stateful" diff --git a/NOTICE.txt b/NOTICE.txt index 5f45da2e7ec..c19c7e958e0 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -5823,6 +5823,36 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +-------------------------------------------------------------------------------- +Dependency : github.com/sajari/regression +Version: v1.0.1 +Licence type (autodetected): MIT +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/github.com/sajari/regression@v1.0.1/LICENSE: + +The MIT License (MIT) + +Copyright (c) 2014 Sajari Pty Ltd + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + -------------------------------------------------------------------------------- Dependency : github.com/schollz/progressbar/v3 Version: v3.13.1 diff --git a/dev-tools/mage/pkg.go b/dev-tools/mage/pkg.go index 6bf9238f0d3..e00bbbdb661 100644 --- a/dev-tools/mage/pkg.go +++ b/dev-tools/mage/pkg.go @@ -230,7 +230,7 @@ func TestPackages(options ...TestPackagesOption) error { if mg.Verbose() { fmt.Println(out) } - return err + return fmt.Errorf("error running package_test.go: %w, stdout: %s", err, out) } return nil diff --git a/docs/test-framework-dev-guide.md b/docs/test-framework-dev-guide.md index 6fc8550d509..45e6711e3ae 100644 --- a/docs/test-framework-dev-guide.md +++ b/docs/test-framework-dev-guide.md @@ -117,6 +117,11 @@ We pass a `-test.count` flag along with the name match We pass a `-test.run` flag along with the names of the tests we want to run in OR `GOTEST_FLAGS="-test.run ^(TestStandaloneUpgrade|TestFleetManagedUpgrade)$" mage integration:test` +##### Run Extended Runtime Leak Test +The test framework includes a "long running" test to check for resource leaks and stability. +The runtime of the test can be set via the `LONG_TEST_RUNTIME` environment variable, as a Go duration string such as `30m` (default: `15m`). +The test itself can be run via the `integration:TestForResourceLeaks` mage target, which sets `TEST_LONG_RUNNING` so the test is not skipped. + ##### Limitations Due to the way the parameters are passed to `devtools.GoTest` the value of the environment variable is split on space, so not all combination of flags and their values may be correctly split. 
diff --git a/go.mod b/go.mod index b99e5f0f72e..878d5f90035 100644 --- a/go.mod +++ b/go.mod @@ -50,6 +50,7 @@ require ( github.com/pierrre/gotestcover v0.0.0-20160517101806-924dca7d15f0 github.com/pkg/errors v0.9.1 github.com/rs/zerolog v1.27.0 + github.com/sajari/regression v1.0.1 github.com/schollz/progressbar/v3 v3.13.1 github.com/shirou/gopsutil/v3 v3.24.1 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index dd6062b1f09..c2d15ee7a35 100644 --- a/go.sum +++ b/go.sum @@ -1700,6 +1700,8 @@ github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFo github.com/safchain/ethtool v0.0.0-20190326074333-42ed695e3de8/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4= github.com/safchain/ethtool v0.0.0-20210803160452-9aa261dae9b1/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4= github.com/sagikazarmark/crypt v0.9.0/go.mod h1:RnH7sEhxfdnPm1z+XMgSLjWTEIjyK4z2dw6+4vHTMuo= +github.com/sajari/regression v1.0.1 h1:iTVc6ZACGCkoXC+8NdqH5tIreslDTT/bXxT6OmHR5PE= +github.com/sajari/regression v1.0.1/go.mod h1:NeG/XTW1lYfGY7YV/Z0nYDV/RGh3wxwd1yW46835flM= github.com/santhosh-tekuri/jsonschema v1.2.4 h1:hNhW8e7t+H1vgY+1QeEQpveR6D4+OwKPXCfD2aieJis= github.com/santhosh-tekuri/jsonschema v1.2.4/go.mod h1:TEAUOeZSmIxTTuHatJzrvARHiuO9LYd+cIxzgEHCQI4= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= diff --git a/internal/pkg/agent/application/monitoring/v1_monitor.go b/internal/pkg/agent/application/monitoring/v1_monitor.go index 485595eda19..48b5a16a820 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor.go @@ -15,6 +15,7 @@ import ( "time" "unicode" + "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/utils" @@ -606,6 +607,10 @@ func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentI }, }, } + dbgLog := logp.L() + for _, comp 
:= range componentList { + dbgLog.Infof("component: %#v\n\t componentData: %#v", comp, comp.Component.String()) + } for unit, binaryName := range componentIDToBinary { if !isSupportedMetricsBinary(binaryName) { continue diff --git a/magefile.go b/magefile.go index 08a23fae72a..b115b99e523 100644 --- a/magefile.go +++ b/magefile.go @@ -1987,6 +1987,14 @@ func (Integration) TestBeatServerless(ctx context.Context, beatname string) erro return integRunner(ctx, false, "TestBeatsServerless") } +func (Integration) TestForResourceLeaks(ctx context.Context) error { + err := os.Setenv("TEST_LONG_RUNNING", "true") + if err != nil { + return fmt.Errorf("error setting TEST_LONG_RUNNING: %w", err) + } + return integRunner(ctx, false, "TestLongRunningAgentForLeaks") +} + // TestOnRemote shouldn't be called locally (called on remote host to perform testing) func (Integration) TestOnRemote(ctx context.Context) error { mg.Deps(Build.TestBinaries) @@ -2213,6 +2221,9 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche extraEnv["AGENT_KEEP_INSTALLED"] = os.Getenv("AGENT_KEEP_INSTALLED") } + extraEnv["TEST_LONG_RUNNING"] = os.Getenv("TEST_LONG_RUNNING") + extraEnv["LONG_TEST_RUNTIME"] = os.Getenv("LONG_TEST_RUNTIME") + // these following two env vars are currently not used by anything, but can be used in the future to test beats or // other binaries, see https://github.com/elastic/elastic-agent/pull/3258 binaryName := os.Getenv("TEST_BINARY_NAME") diff --git a/pkg/testing/fixture.go b/pkg/testing/fixture.go index acb83329400..f4990f591c0 100644 --- a/pkg/testing/fixture.go +++ b/pkg/testing/fixture.go @@ -701,7 +701,10 @@ func (f *Fixture) ExecStatus(ctx context.Context, opts ...process.CmdOption) (Ag }, uerr)) } - return status, err + if err != nil { + return status, fmt.Errorf("error running command (output: %s): %w", string(out), err) + } + return status, nil } // ExecInspect executes to inspect subcommand on the prepared Elastic Agent binary. 
const eprProd = "https://epr.elastic.co"

// PackageSearchResult contains basic info on a package returned by a search.
type PackageSearchResult struct {
	Name    string `json:"name"`
	Version string `json:"version"`
	Release string `json:"release"`
	Path    string `json:"path"`
}

// GetLatestPackageRelease returns the version string of the latest release of
// packageName, as reported by the /search endpoint of the production package
// registry (eprProd).
//
// The request is bound to ctx and sent with http.DefaultClient. An error is
// returned when the request cannot be built or performed, when the body
// cannot be read, when the status code is 300 or above, when the body is not
// a JSON list of PackageSearchResult, or when the list is empty.
func GetLatestPackageRelease(ctx context.Context, packageName string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, eprProd+"/search", nil)
	if err != nil {
		return "", fmt.Errorf("error creating HTTP request: %w", err)
	}

	// Build the query string through url.Values so that packageName is
	// escaped rather than spliced verbatim into the URL.
	query := req.URL.Query()
	query.Set("package", packageName)
	query.Set("all", "false")
	req.URL.RawQuery = query.Encode()

	resp, err := http.DefaultClient.Do(req) //nolint:gosec,nolintlint // it's a test
	if err != nil {
		// resp is nil when Do fails, so it must not be touched before this check.
		return "", fmt.Errorf("failed to perform search request for EPR: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("error reading body of HTTP resp: %w", err)
	}
	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("bad status code in response from EPR: %d - %s", resp.StatusCode, resp.Status)
	}

	parsedResp := []PackageSearchResult{}
	if err := json.Unmarshal(body, &parsedResp); err != nil {
		return "", fmt.Errorf("error parsing search response: %w", err)
	}
	// if we set &all=false, we'll get at most one result
	if len(parsedResp) < 1 {
		return "", fmt.Errorf("no packages matching '%s' found", packageName)
	}

	return parsedResp[0].Version, nil
}
handleDocsResponse(res) +} + // GetLogsForDatasetWithContext returns any logs associated with the datastream func GetLogsForDatasetWithContext(ctx context.Context, client elastictransport.Interface, index string) (Documents, error) { indexQuery := map[string]interface{}{ @@ -546,6 +584,38 @@ func performQueryForRawQuery(ctx context.Context, queryRaw map[string]interface{ return handleDocsResponse(res) } +// FindMatchingLogLinesForAgentWithContext returns the matching `message` line field for an agent with the matching ID +func FindMatchingLogLinesForAgentWithContext(ctx context.Context, client elastictransport.Interface, agentID, line string) (Documents, error) { + queryRaw := map[string]interface{}{ + "query": map[string]interface{}{ + "bool": map[string]interface{}{ + "must": []map[string]interface{}{ + { + "match_phrase": map[string]interface{}{ + "message": line, + }, + }, + { + "term": map[string]interface{}{ + "agent.id": map[string]interface{}{ + "value": agentID, + }, + }, + }, + }, + }, + }, + } + + var buf bytes.Buffer + err := json.NewEncoder(&buf).Encode(queryRaw) + if err != nil { + return Documents{}, fmt.Errorf("error creating ES query: %w", err) + } + + return performQueryForRawQuery(ctx, queryRaw, "logs-elastic_agent*", client) +} + // GetLogsForDatastream returns any logs associated with the datastream func GetLogsForDatastream( ctx context.Context, diff --git a/pkg/testing/tools/slope.go b/pkg/testing/tools/slope.go new file mode 100644 index 00000000000..301cd7cea24 --- /dev/null +++ b/pkg/testing/tools/slope.go @@ -0,0 +1,48 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package tools + +import ( + "time" + + "github.com/sajari/regression" +) + +// Slope is a slim wrapper around a regression library for calculating rate of change over time in tests. 
+type Slope struct { + handler *regression.Regression +} + +func NewSlope(label string) Slope { + handler := new(regression.Regression) + handler.SetObserved(label) + handler.SetVar(0, "time") + return Slope{handler: handler} +} + +// add a datapoint and timestamp to the calculaton. +func (slope Slope) AddDatapoint(count float64, timeSinceStart time.Duration) { + slope.handler.Train(regression.DataPoint(count, []float64{timeSinceStart.Seconds()})) +} + +// Run the regression on the supplied data +func (slope Slope) Run() error { + return slope.handler.Run() +} + +// return the slope of the regression +func (slope Slope) GetSlope() float64 { + return slope.handler.GetCoeffs()[1] +} + +// Formula returns a string representation of the regression formula +func (slope Slope) Formula() string { + return slope.handler.Formula +} + +// Debug returns a string representation of the regression, including all datapoints +func (slope Slope) Debug() string { + return slope.handler.String() +} diff --git a/pkg/testing/tools/slope_test.go b/pkg/testing/tools/slope_test.go new file mode 100644 index 00000000000..cd456d68d12 --- /dev/null +++ b/pkg/testing/tools/slope_test.go @@ -0,0 +1,52 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package tools + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSlopeMeasurement(t *testing.T) { + + testCases := []struct { + name string + datapoints []float64 + test func(slope float64, t *testing.T) + }{ + { + name: "good handle counts", + datapoints: []float64{17.00, 13.00, 16.00, 13.00, 17.00, 13.00, 16.00, 13.00, 16.00, 14.00, + 18.00, 15.00, 12.00, 16.00, 14.00, 18.00, 14.00, 17.00, 15.00, 18.00, 15.00, 18.00, 15.00, 18.00, + 14.00, 18.00, 14.00, 17.00, 13.00, + }, + test: func(slope float64, t *testing.T) { + require.LessOrEqual(t, slope, 0.01) + }, + }, + { + name: "bad handle counts", + datapoints: []float64{967, 1097, 2190, 3099, 3906, 4390, 5239, 6209, 7097, 7989, 8890, 9976, + 10957, 11679, 12907, 13806, 13969, 14898, 16103, 17207, 18109, 19459, 21004, 21947}, + test: func(slope float64, t *testing.T) { + require.Greater(t, slope, 1.00) + }, + }, + } + + for _, test := range testCases { + testHandle := NewSlope("test") + startingTime := 10 + for _, handleCount := range test.datapoints { + testHandle.AddDatapoint(handleCount, time.Second*time.Duration(startingTime)) + startingTime += 10 + } + err := testHandle.Run() + require.NoError(t, err) + test.test(testHandle.GetSlope(), t) + } +} diff --git a/testing/integration/agent_long_running_leak_test.go b/testing/integration/agent_long_running_leak_test.go new file mode 100644 index 00000000000..6b96ca13371 --- /dev/null +++ b/testing/integration/agent_long_running_leak_test.go @@ -0,0 +1,291 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
// TestLongRunningAgentForLeaks is the entry point of the extended-runtime
// leak test. It declares the test's requirements (the "fleet" group, a stack,
// a non-local run with sudo, on Linux or Windows), skips unless the
// TEST_LONG_RUNNING environment variable is non-empty, and then runs the
// ExtendedRunner suite with a 3-minute health-check window polled every
// 20 seconds.
func TestLongRunningAgentForLeaks(t *testing.T) {
	// NOTE(review): define.Require is deliberately left as the first statement,
	// ahead of the skip below; presumably the integration framework relies on
	// that ordering — confirm before moving it.
	info := define.Require(t, define.Requirements{
		Group: "fleet",
		Stack: &define.Stack{},
		Local: false, // requires Agent installation
		Sudo:  true,  // requires Agent installation
		OS: []define.OS{
			{Type: define.Linux},
			{Type: define.Windows},
		},
	})

	// The test is opt-in: any non-empty value of TEST_LONG_RUNNING enables it.
	if os.Getenv("TEST_LONG_RUNNING") == "" {
		t.Skip("not running extended test unless TEST_LONG_RUNNING is set")
	}

	suite.Run(t, &ExtendedRunner{info: info, healthCheckTime: time.Minute * 3, healthCheckRefreshTime: time.Second * 20})
}
runner.InstallPackage(ctx, "apache", "1.17.0", "agent_long_test_apache.json", uuid.New().String(), policyResp.ID) + +} + +func (runner *ExtendedRunner) InstallPackage(ctx context.Context, name string, version string, cfgFile string, policyUUID string, policyID string) { + installPackage := kibana.PackagePolicyRequest{} + + jsonRaw, err := os.ReadFile(cfgFile) + require.NoError(runner.T(), err) + + err = json.Unmarshal(jsonRaw, &installPackage) + require.NoError(runner.T(), err) + + installPackage.Package.Version = version + installPackage.ID = policyUUID + installPackage.PolicyID = policyID + installPackage.Namespace = "default" + installPackage.Name = fmt.Sprintf("%s-long-test-%s", name, policyUUID) + installPackage.Vars = map[string]interface{}{} + + runner.T().Logf("Installing %s package....", name) + _, err = runner.info.KibanaClient.InstallFleetPackage(ctx, installPackage) + require.NoError(runner.T(), err, "error creating fleet package") +} + +func (runner *ExtendedRunner) TestHandleLeak() { + ctx, cancel := context.WithTimeout(context.Background(), time.Hour) + defer cancel() + + testRuntime := os.Getenv("LONG_TEST_RUNTIME") + if testRuntime == "" { + testRuntime = "15m" + } + + status, err := runner.agentFixture.ExecStatus(ctx) + require.NoError(runner.T(), err) + + // because we need to separately fetch the PIDs, wait until everything is healthy before we look for running beats + require.Eventually(runner.T(), func() bool { + allHealthy := true + status, err := runner.agentFixture.ExecStatus(ctx) + + apacheMatch := "logfile-apache" + foundApache := false + systemMatch := "metrics-default" + foundSystem := false + + require.NoError(runner.T(), err) + for _, comp := range status.Components { + // make sure the components include the expected integrations + for _, v := range comp.Units { + runner.T().Logf("unit ID: %s", v.UnitID) + // the full unit ID will be something like "log-default-logfile-cef-3f0764f0-4ade-4f46-9ead-f2f0f7865676" + if !foundApache && 
strings.Contains(v.UnitID, apacheMatch) { + foundApache = true + } + if !foundSystem && strings.Contains(v.UnitID, systemMatch) { + foundSystem = true + } + } + runner.T().Logf("component state: %s", comp.Message) + if comp.State != int(cproto.State_HEALTHY) { + allHealthy = false + } + } + return allHealthy && foundApache && foundSystem + }, runner.healthCheckTime, runner.healthCheckRefreshTime, "install never became healthy") + + handles := []processWatcher{} + + // track running beats + // the `last 30s` metrics tend to report gauges, which we can't use for calculating a derivative. + // so separately fetch the PIDs + pidInStatusMessageRegex := regexp.MustCompile(`[\d]+`) + status, err = runner.agentFixture.ExecStatus(ctx) + require.NoError(runner.T(), err) + for _, comp := range status.Components { + pidStr := pidInStatusMessageRegex.FindString(comp.Message) + pid, err := strconv.ParseInt(pidStr, 10, 64) + require.NoError(runner.T(), err) + + handle, err := sysinfo.Process(int(pid)) + require.NoError(runner.T(), err) + handlesReg := tools.NewSlope(fmt.Sprintf("%s handle usage", comp.Name)) + + runner.T().Logf("created handle watcher for %s (%d)", comp.Name, pid) + handles = append(handles, processWatcher{handle: handle, pid: int(pid), name: comp.Name, regHandles: handlesReg}) + } + + testDuration, err := time.ParseDuration(testRuntime) + require.NoError(runner.T(), err) + + timer := time.NewTimer(testDuration) + defer timer.Stop() + + // time to perform a health check + ticker := time.NewTicker(time.Second * 10) + defer ticker.Stop() + + done := false + start := time.Now() + for !done { + select { + case <-timer.C: + done = true + case <-ticker.C: + err := runner.agentFixture.IsHealthy(ctx) + require.NoError(runner.T(), err) + // for each running process, collect memory and handles + for _, handle := range handles { + + ohc, ok := handle.handle.(types.OpenHandleCounter) + if ok { + handleCount, err := ohc.OpenHandleCount() + require.NoError(runner.T(), err) + 
handle.regHandles.AddDatapoint(float64(handleCount), time.Since(start)) + } + + } + } + } + + // we're measuring the handle usage as y=mx+b + // if the slope is increasing above a certain rate, fail the test + // A number of factors can change the slope during a test; shortened runtime (lots of handles allocated in the first few seconds, producing an upward slope), + // filebeat trying to open a large number of log files, etc + handleSlopeFailure := 0.1 + + for _, handle := range handles { + err = handle.regHandles.Run() + require.NoError(runner.T(), err) + + runner.T().Logf("=============================== %s (%d)", handle.name, handle.pid) + runner.T().Logf("handle formula: %s", handle.regHandles.Formula()) + handleSlope := handle.regHandles.GetSlope() + require.LessOrEqual(runner.T(), handleSlope, handleSlopeFailure, "increase in open handles exceeded threshold: %s", handle.regHandles.Debug()) + runner.T().Logf("===============================") + } + + // post-test: make sure that we actually ingested logs. 
+ docs, err := estools.GetResultsForAgentAndDatastream(ctx, runner.info.ESClient, "apache.error", status.Info.ID) + assert.NoError(runner.T(), err, "error fetching apache logs") + assert.Greater(runner.T(), docs.Hits.Total.Value, 0, "could not find any matching apache logs for agent ID %s", status.Info.ID) + runner.T().Logf("Generated %d apache logs", docs.Hits.Total.Value) + + docs, err = estools.GetResultsForAgentAndDatastream(ctx, runner.info.ESClient, "system.cpu", status.Info.ID) + assert.NoError(runner.T(), err, "error fetching system metrics") + assert.Greater(runner.T(), docs.Hits.Total.Value, 0, "could not find any matching system metrics for agent ID %s", status.Info.ID) + runner.T().Logf("Generated %d system events", docs.Hits.Total.Value) +} diff --git a/testing/integration/agent_long_test_apache.json b/testing/integration/agent_long_test_apache.json new file mode 100644 index 00000000000..f1f3a6d27b2 --- /dev/null +++ b/testing/integration/agent_long_test_apache.json @@ -0,0 +1,251 @@ +{ + "id": "5cca0416-2c8e-43bb-a12f-108088a2d19c", + "version": "WzY3NiwxXQ==", + "name": "apache-1", + "namespace": "", + "description": "", + "package": { + "name": "apache", + "title": "Apache HTTP Server", + "version": "1.17.0" + }, + "enabled": true, + "inputs": [ + { + "type": "logfile", + "policy_template": "apache", + "enabled": true, + "streams": [ + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "apache.access" + }, + "vars": { + "paths": { + "value": [ + "/var/log/apache2/access.log*", + "/var/log/apache2/other_vhosts_access.log*", + "/var/log/httpd/access_log*" + ], + "type": "text" + }, + "tags": { + "value": [ + "apache-access" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "processors": { + "type": "yaml" + } + }, + "id": "logfile-apache.access-5cca0416-2c8e-43bb-a12f-108088a2d19c", + "compiled_stream": { + "paths": [ + "/var/log/apache2/access.log*", + 
"/var/log/apache2/other_vhosts_access.log*", + "/var/log/httpd/access_log*" + ], + "tags": [ + "apache-access" + ], + "exclude_files": [ + ".gz$" + ] + } + }, + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "apache.error" + }, + "vars": { + "paths": { + "value": [ + "/var/log/apache2/error.log*", + "/var/log/httpd/error_log*" + ], + "type": "text" + }, + "tags": { + "value": [ + "apache-error" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "processors": { + "type": "yaml" + } + }, + "id": "logfile-apache.error-5cca0416-2c8e-43bb-a12f-108088a2d19c", + "compiled_stream": { + "paths": [ + "/var/log/apache2/error.log*", + "/var/log/httpd/error_log*" + ], + "exclude_files": [ + ".gz$" + ], + "tags": [ + "apache-error" + ], + "processors": [ + { + "add_locale": null + } + ] + } + } + ], + "vars": { + "condition": { + "type": "text" + } + } + }, + { + "type": "httpjson", + "policy_template": "apache", + "enabled": false, + "streams": [ + { + "enabled": false, + "data_stream": { + "type": "logs", + "dataset": "apache.access" + }, + "vars": { + "interval": { + "value": "10s", + "type": "text" + }, + "search": { + "value": "search sourcetype=\"access*\"", + "type": "text" + }, + "tags": { + "value": [ + "forwarded", + "apache-access" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "processors": { + "type": "yaml" + }, + "enable_request_tracer": { + "type": "bool" + } + }, + "id": "httpjson-apache.access-5cca0416-2c8e-43bb-a12f-108088a2d19c" + }, + { + "enabled": false, + "data_stream": { + "type": "logs", + "dataset": "apache.error" + }, + "vars": { + "interval": { + "value": "10s", + "type": "text" + }, + "search": { + "value": "search sourcetype=apache:error OR sourcetype=apache_error", + "type": "text" + }, + "tags": { + "value": [ + "forwarded", + "apache-error" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + 
"type": "bool" + }, + "processors": { + "type": "yaml" + }, + "enable_request_tracer": { + "type": "bool" + } + }, + "id": "httpjson-apache.error-5cca0416-2c8e-43bb-a12f-108088a2d19c" + } + ], + "vars": { + "url": { + "value": "https://server.example.com:8089", + "type": "text" + }, + "username": { + "type": "text" + }, + "password": { + "type": "password" + }, + "token": { + "type": "password" + } + } + }, + { + "type": "apache/metrics", + "policy_template": "apache", + "enabled": false, + "streams": [ + { + "enabled": false, + "data_stream": { + "type": "metrics", + "dataset": "apache.status" + }, + "vars": { + "period": { + "value": "30s", + "type": "text" + }, + "server_status_path": { + "value": "/server-status", + "type": "text" + } + }, + "id": "apache/metrics-apache.status-5cca0416-2c8e-43bb-a12f-108088a2d19c" + } + ], + "vars": { + "hosts": { + "value": [ + "http://127.0.0.1" + ], + "type": "text" + }, + "condition": { + "type": "text" + } + } + } + ], + "revision": 1, + "created_at": "2024-02-01T16:52:06.512Z", + "created_by": "system", + "updated_at": "2024-02-01T16:52:06.512Z", + "updated_by": "system" +} \ No newline at end of file diff --git a/testing/integration/agent_long_test_base_system_integ.json b/testing/integration/agent_long_test_base_system_integ.json new file mode 100644 index 00000000000..e231beb173b --- /dev/null +++ b/testing/integration/agent_long_test_base_system_integ.json @@ -0,0 +1,788 @@ +{ + "id": "9bf446fc-58d4-4767-b42d-3450815d5d3d", + "version": "WzYzMSwxXQ==", + "name": "system-1", + "namespace": "default", + "package": { + "name": "system", + "title": "System", + "version": "1.53.0" + }, + "enabled": true, + "inputs": [ + { + "type": "logfile", + "policy_template": "system", + "enabled": true, + "streams": [ + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "system.auth" + }, + "vars": { + "ignore_older": { + "value": "72h", + "type": "text" + }, + "paths": { + "value": [ + "/var/log/auth.log*", + 
"/var/log/secure*" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "tags": { + "value": [ + "system-auth" + ], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "logfile-system.auth-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "ignore_older": "72h", + "paths": [ + "/var/log/auth.log*", + "/var/log/secure*" + ], + "exclude_files": [ + ".gz$" + ], + "multiline": { + "pattern": "^\\s", + "match": "after" + }, + "tags": [ + "system-auth" + ], + "processors": [ + { + "add_locale": null + }, + { + "rename": { + "fields": [ + { + "from": "message", + "to": "event.original" + } + ], + "ignore_missing": true, + "fail_on_error": false + } + }, + { + "syslog": { + "field": "event.original", + "ignore_missing": true, + "ignore_failure": true + } + } + ] + } + }, + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "system.syslog" + }, + "vars": { + "paths": { + "value": [ + "/var/log/messages*", + "/var/log/syslog*", + "/var/log/system*" + ], + "type": "text" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + }, + "ignore_older": { + "value": "72h", + "type": "text" + }, + "exclude_files": { + "value": [ + "\\.gz$" + ], + "type": "text" + } + }, + "id": "logfile-system.syslog-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "paths": [ + "/var/log/messages*", + "/var/log/syslog*", + "/var/log/system*" + ], + "exclude_files": [ + "\\.gz$" + ], + "multiline": { + "pattern": "^\\s", + "match": "after" + }, + "processors": [ + { + "add_locale": null + } + ], + "tags": null, + "ignore_older": "72h" + } + } + ] + }, + { + "type": "winlog", + "policy_template": "system", + "enabled": true, + "streams": [ + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "system.application" + }, + "vars": { + "preserve_original_event": { + 
"value": false, + "type": "bool" + }, + "event_id": { + "type": "text" + }, + "ignore_older": { + "value": "72h", + "type": "text" + }, + "language": { + "value": 0, + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "winlog-system.application-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "name": "Application", + "condition": "${host.platform} == 'windows'", + "ignore_older": "72h" + } + }, + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "system.security" + }, + "vars": { + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "event_id": { + "type": "text" + }, + "ignore_older": { + "value": "72h", + "type": "text" + }, + "language": { + "value": 0, + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "winlog-system.security-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "name": "Security", + "condition": "${host.platform} == 'windows'", + "ignore_older": "72h" + } + }, + { + "enabled": true, + "data_stream": { + "type": "logs", + "dataset": "system.system" + }, + "vars": { + "preserve_original_event": { + "value": false, + "type": "bool" + }, + "event_id": { + "type": "text" + }, + "ignore_older": { + "value": "72h", + "type": "text" + }, + "language": { + "value": 0, + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "winlog-system.system-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "name": "System", + "condition": "${host.platform} == 'windows'", + "ignore_older": "72h" + } + } + ] + }, + { + "type": "system/metrics", + "policy_template": "system", + "enabled": true, + "streams": [ + { + "enabled": false, + "data_stream": { + "type": "metrics", + "dataset": "system.core" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "core.metrics": 
{ + "value": [ + "percentages" + ], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.core-9bf446fc-58d4-4767-b42d-3450815d5d3d" + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.cpu" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "cpu.metrics": { + "value": [ + "percentages", + "normalized_percentages" + ], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.cpu-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "cpu" + ], + "cpu.metrics": [ + "percentages", + "normalized_percentages" + ], + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.diskio" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "diskio.include_devices": { + "value": [], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + } + }, + "id": "system/metrics-system.diskio-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "diskio" + ], + "diskio.include_devices": null, + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.filesystem" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "filesystem.ignore_types": { + "value": [], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "value": "\"\"", + "type": "yaml" + } + }, + "id": "system/metrics-system.filesystem-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "filesystem" + ], + "period": "1s", + "processors": [ + { + "drop_event.when.regexp": { + "system.filesystem.mount_point": "^/(sys|cgroup|proc|dev|etc|host|lib|snap)($|/)" + } + } + ] + } + }, + { + "enabled": true, + "data_stream": { + "type": 
"metrics", + "dataset": "system.fsstat" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "value": "\"\"", + "type": "yaml" + } + }, + "id": "system/metrics-system.fsstat-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "fsstat" + ], + "period": "1s", + "processors": [ + { + "drop_event.when.regexp": { + "system.fsstat.mount_point": "^/(sys|cgroup|proc|dev|etc|host|lib|snap)($|/)" + } + } + ] + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.load" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.load-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "load" + ], + "condition": "${host.platform} != 'windows'", + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.memory" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.memory-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "memory" + ], + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.network" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "network.interfaces": { + "value": [], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.network-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "network" + ], + "period": "1s", + "network.interfaces": null + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": 
"system.process" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "process.include_top_n.by_cpu": { + "value": 5, + "type": "integer" + }, + "process.include_top_n.by_memory": { + "value": 5, + "type": "integer" + }, + "process.cmdline.cache.enabled": { + "value": true, + "type": "bool" + }, + "process.cgroups.enabled": { + "value": false, + "type": "bool" + }, + "process.env.whitelist": { + "value": [], + "type": "text" + }, + "process.include_cpu_ticks": { + "value": false, + "type": "bool" + }, + "processes": { + "value": [ + ".*" + ], + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.process-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "process" + ], + "period": "1s", + "process.include_top_n.by_cpu": 5, + "process.include_top_n.by_memory": 5, + "process.cmdline.cache.enabled": true, + "process.cgroups.enabled": false, + "process.include_cpu_ticks": false, + "processes": [ + ".*" + ] + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.process.summary" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.process.summary-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "process_summary" + ], + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": "metrics", + "dataset": "system.socket_summary" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.socket_summary-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "socket_summary" + ], + "period": "1s" + } + }, + { + "enabled": true, + "data_stream": { + "type": 
"metrics", + "dataset": "system.uptime" + }, + "vars": { + "period": { + "value": "1s", + "type": "text" + }, + "tags": { + "value": [], + "type": "text" + }, + "processors": { + "type": "yaml" + } + }, + "id": "system/metrics-system.uptime-9bf446fc-58d4-4767-b42d-3450815d5d3d", + "compiled_stream": { + "metricsets": [ + "uptime" + ], + "period": "1s" + } + } + ], + "vars": { + "system.hostfs": { + "type": "text" + } + } + }, + { + "type": "httpjson", + "policy_template": "system", + "enabled": false, + "streams": [ + { + "enabled": false, + "data_stream": { + "type": "logs", + "dataset": "system.application" + }, + "vars": { + "interval": { + "value": "1s", + "type": "text" + }, + "search": { + "value": "search sourcetype=\"XmlWinEventLog:Application\"", + "type": "text" + }, + "tags": { + "value": [ + "forwarded" + ], + "type": "text" + } + }, + "id": "httpjson-system.application-9bf446fc-58d4-4767-b42d-3450815d5d3d" + }, + { + "enabled": false, + "data_stream": { + "type": "logs", + "dataset": "system.security" + }, + "vars": { + "interval": { + "value": "1s", + "type": "text" + }, + "search": { + "value": "search sourcetype=\"XmlWinEventLog:Security\"", + "type": "text" + }, + "tags": { + "value": [ + "forwarded" + ], + "type": "text" + } + }, + "id": "httpjson-system.security-9bf446fc-58d4-4767-b42d-3450815d5d3d" + }, + { + "enabled": false, + "data_stream": { + "type": "logs", + "dataset": "system.system" + }, + "vars": { + "interval": { + "value": "1s", + "type": "text" + }, + "search": { + "value": "search sourcetype=\"XmlWinEventLog:System\"", + "type": "text" + }, + "tags": { + "value": [ + "forwarded" + ], + "type": "text" + } + }, + "id": "httpjson-system.system-9bf446fc-58d4-4767-b42d-3450815d5d3d" + } + ], + "vars": { + "url": { + "value": "https://server.example.com:8089", + "type": "text" + }, + "enable_request_tracer": { + "type": "bool" + }, + "username": { + "type": "text" + }, + "password": { + "type": "password" + }, + "token": { + "type": 
"password" + }, + "preserve_original_event": { + "value": false, + "type": "bool" + } + } + } + ] + } \ No newline at end of file