diff --git a/data/templates/plugins/openshift-conformance-replay.yaml b/data/templates/plugins/openshift-conformance-replay.yaml index 068c5ec5..a799f790 100644 --- a/data/templates/plugins/openshift-conformance-replay.yaml +++ b/data/templates/plugins/openshift-conformance-replay.yaml @@ -29,13 +29,13 @@ podSpec: --certificate-authority="${SA_CA_PATH}"; env: - name: KUBECONFIG - value: "/tmp/shared/kubeconfig" + value: /tmp/shared/kubeconfig - name: KUBE_API_URL value: "https://172.30.0.1:443" - name: SA_TOKEN_PATH - value: "/var/run/secrets/kubernetes.io/serviceaccount/token" + value: /var/run/secrets/kubernetes.io/serviceaccount/token - name: SA_CA_PATH - value: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + value: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt volumeMounts: - mountPath: /tmp/shared name: shared @@ -51,11 +51,11 @@ podSpec: - name: KUBECONFIG value: /tmp/shared/kubeconfig - name: PLUGIN_NAME - value: "openshift-tests-replay" + value: openshift-tests-replay - name: DEFAULT_SUITE_NAME - value: "all" + value: all - name: OT_RUN_COMMAND - value: "run" + value: run sonobuoy-config: driver: Job @@ -64,9 +64,9 @@ sonobuoy-config: description: | OPCT plugin to collect e2e failures from previous executions and schedule a new execution running in serial mode with openshift-tests. - source-url: - "https://github.com/redhat-openshift-ecosystem/provider-certification-tool/\ - blob/main/manifests/openshift-conformance-validated.yaml" + source-url: | + https://github.com/redhat-openshift-ecosystem/provider-certification-tool/\ + blob/main/manifests/openshift-conformance-validated.yaml skipCleanup: true spec: name: plugin @@ -85,7 +85,7 @@ spec: - name: KUBECONFIG value: /tmp/shared/kubeconfig - name: PLUGIN_NAME - value: "openshift-tests-replay" + value: openshift-tests-replay - name: PLUGIN_ID value: "80" - name: ENV_NODE_NAME diff --git a/docs/opct/report.md b/docs/opct/report.md index 4d3a679b..9795a47f 100644 --- a/docs/opct/report.md +++ b/docs/opct/report.md @@ -1 +1,21 @@ -# opct report \ No newline at end of file +# opct report + +## Usage + + +## Examples + +### Running the default conformance workflow + + + +### Development + +```sh +VERSION=v0.0.0-devel-d4745f8 +opct-devel destroy; opct-devel run -w --devel-limit-tests=10 --log-level=debug \ +--plugins-image=quay.io/opct/plugin-openshift-tests:${VERSION} \ +--collector-image=quay.io/opct/plugin-artifacts-collector:${VERSION} \ +--must-gather-monitoring-image=quay.io/opct/must-gather-monitoring:${VERSION}; +opct-devel retrieve +``` diff --git a/internal/opct/plugin/plugin.go b/internal/opct/plugin/plugin.go index 4bc2cab1..798261bb 100644 --- a/internal/opct/plugin/plugin.go +++ b/internal/opct/plugin/plugin.go @@ -81,7 +81,7 @@ type OPCTPluginSummary struct { // It should not be used to exclude failures from the report of e2e included in suite, // but to remove known flake/failures that is not relevant to the pipeline. // Example: '[sig-arch] External binary usage' - Filter5KnownFailures []string + // Filter5KnownFailures []string FailedFilter5 []string FailedExcludedFilter5 []string @@ -113,4 +113,70 @@ func (ps *OPCTPluginSummary) GetErrorCounters() *archive.ErrorCounter { return ps.calculateErrorCounter() } -// +const ( + // FilterNameSuiteOnly is the filter to remove failures of tests not included in the suite. + FilterNameSuiteOnly = "suite-only" + + // FilterID1 is the filter to exclude known failures from the OPCT CI. 
+ FilterNameKF = "known-failures" + + FilterNameBaseline = "baseline" + + FilterNameFlaky = "flaky" + + FilterNameReplay = "replay" + + FilterNameFinalCopy = "copy" +) + +// TODO: move to a chain stack to allow multiple filters. +func (ps *OPCTPluginSummary) GetFailuresByFilterID(filterID string) ([]string, []string) { + switch filterID { + case FilterNameSuiteOnly: + return ps.FailedFilter1, ps.FailedExcludedFilter1 + case FilterNameBaseline: + return ps.FailedFilter2, ps.FailedExcludedFilter2 + case FilterNameKF: + return ps.FailedFilter5, ps.FailedExcludedFilter5 + case FilterNameReplay: + return ps.FailedFilter6, ps.FailedExcludedFilter6 + } + return nil, nil +} + +func (ps *OPCTPluginSummary) SetFailuresByFilterID(filterID string, failures []string, excluded []string) { + switch filterID { + case FilterNameSuiteOnly: + ps.FailedFilter1 = failures + ps.FailedExcludedFilter1 = excluded + return + case FilterNameBaseline: + ps.FailedFilter2 = failures + ps.FailedExcludedFilter2 = excluded + return + case FilterNameKF: + ps.FailedFilter5 = failures + ps.FailedExcludedFilter5 = excluded + return + case FilterNameReplay: + ps.FailedFilter6 = failures + ps.FailedExcludedFilter6 = excluded + return + } +} + +func (ps *OPCTPluginSummary) GetPreviousFailuresByFilterID(filterID string) []string { + switch filterID { + case FilterNameSuiteOnly: + return nil + case FilterNameKF: + return ps.FailedFilter1 // SuiteOnly + case FilterNameReplay: + return ps.FailedFilter5 // KnownFailures + case FilterNameBaseline: + return ps.FailedFilter6 // Replay + case FilterNameFinalCopy: + return ps.FailedFilter4 // BaselineAPI + } + return nil +} diff --git a/internal/opct/summary/consolidated.go b/internal/opct/summary/consolidated.go index 3a487f47..7981a63f 100644 --- a/internal/opct/summary/consolidated.go +++ b/internal/opct/summary/consolidated.go @@ -91,15 +91,27 @@ func (cs *ConsolidatedSummary) Process() error { return err } + log.Debug("Processing results/Applying filters/5/Known Failures") + cs.Timers.Set("cs-process/filter5-known-failures") + if err := cs.applyFilterKnownFailures(plugin.FilterNameKF); err != nil { + return err + } + + log.Debug("Processing results/Applying filters/6/Replay") + cs.Timers.Set("cs-process/filter5-known-failures") + if err := cs.applyFilterReplay(plugin.FilterNameReplay); err != nil { + return err + } + log.Debug("Processing results/Applying filters/2/Baseline") cs.Timers.Set("cs-process/filter2-baseline") - if err := cs.applyFilterBaseline(); err != nil { + if err := cs.applyFilterBaseline(plugin.FilterNameBaseline); err != nil { return err } log.Debug("Processing results/Applying filters/3/Flake") cs.Timers.Set("cs-process/filter3-flake") - if err := cs.applyFilterFlaky(); err != nil { + if err := cs.applyFilterFlaky(plugin.FilterNameFlaky); err != nil { return err } @@ -109,20 +121,9 @@ func (cs *ConsolidatedSummary) Process() error { return err } - log.Debug("Processing results/Applying filters/5/Known Failures") - cs.Timers.Set("cs-process/filter5-known-failures") - if err := cs.applyFilterKnownFailures(); err != nil { - return err - } - log.Debug("Processing results/Applying filters/6/Replay") - cs.Timers.Set("cs-process/filter5-known-failures") - if err := cs.applyFilterReplay(); err != nil { - return err - } - log.Debug("Processing results/Applying filters/Saving final filter") cs.Timers.Set("cs-process/filter5-known-failures") - if err := cs.applyFilterCopyPipeline(); err != nil { + if err := cs.applyFilterCopyPipeline(plugin.FilterNameFinalCopy); err != nil { 
return err } @@ -230,14 +231,14 @@ func (cs *ConsolidatedSummary) applyFilterSuiteForPlugin(pluginName string) erro // Filter2: Baseline archive // applyFilterBaseline process the FailedFilterSuite for each plugin, **excluding** failures from // baseline test. -func (cs *ConsolidatedSummary) applyFilterBaseline() error { +func (cs *ConsolidatedSummary) applyFilterBaseline(filterID string) error { for _, pluginName := range []string{ plugin.PluginNameOpenShiftUpgrade, plugin.PluginNameKubernetesConformance, plugin.PluginNameOpenShiftConformance, plugin.PluginNameConformanceReplay, } { - if err := cs.applyFilterBaselineForPlugin(pluginName); err != nil { + if err := cs.applyFilterBaselineForPlugin(pluginName, filterID); err != nil { return fmt.Errorf("error while processing filter2 (baseline archive): %w", err) } } @@ -246,7 +247,7 @@ func (cs *ConsolidatedSummary) applyFilterBaseline() error { // applyFilterBaselineForPlugin calculates the **exclusion** tests of // Provider Failed included on suite and Baseline failed tests. -func (cs *ConsolidatedSummary) applyFilterBaselineForPlugin(pluginName string) error { +func (cs *ConsolidatedSummary) applyFilterBaselineForPlugin(pluginName string, filterID string) error { var ps *plugin.OPCTPluginSummary var e2eFailuresBaseline []string @@ -276,7 +277,8 @@ func (cs *ConsolidatedSummary) applyFilterBaselineForPlugin(pluginName string) e return errors.New("Suite not found to apply filter: Flaky") } - e2eFailuresProvider := ps.FailedFilter1 + filterFailures, filterFailuresExcluded := ps.GetFailuresByFilterID(filterID) + e2eFailuresProvider := ps.GetPreviousFailuresByFilterID(filterID) hashBaseline := make(map[string]struct{}, len(e2eFailuresBaseline)) for _, v := range e2eFailuresBaseline { @@ -286,27 +288,28 @@ func (cs *ConsolidatedSummary) applyFilterBaselineForPlugin(pluginName string) e for _, v := range e2eFailuresProvider { ps.Tests[v].State = "filter2Baseline" if _, ok := hashBaseline[v]; !ok { - ps.FailedFilter2 = append(ps.FailedFilter2, v) + filterFailures = append(filterFailures, v) continue } - ps.FailedExcludedFilter2 = append(ps.FailedExcludedFilter2, v) + filterFailuresExcluded = append(filterFailuresExcluded, v) } - sort.Strings(ps.FailedFilter2) + sort.Strings(filterFailures) + ps.SetFailuresByFilterID(filterID, filterFailures, filterFailuresExcluded) log.Debugf("Filter2 (Baseline) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", - pluginName, len(ps.FailedFilter1), - len(ps.FailedFilter2), len(ps.FailedExcludedFilter2)) + pluginName, len(e2eFailuresProvider), + len(filterFailures), len(filterFailuresExcluded)) return nil } // Filter3: Flaky // applyFilterFlaky process the FailedFilterSuite for each plugin, **excluding** failures from // baseline test. 
-func (cs *ConsolidatedSummary) applyFilterFlaky() error { - if err := cs.applyFilterFlakeForPlugin(plugin.PluginNameKubernetesConformance); err != nil { +func (cs *ConsolidatedSummary) applyFilterFlaky(filterID string) error { + if err := cs.applyFilterFlakeForPlugin(plugin.PluginNameKubernetesConformance, filterID); err != nil { return err } - if err := cs.applyFilterFlakeForPlugin(plugin.PluginNameOpenShiftConformance); err != nil { + if err := cs.applyFilterFlakeForPlugin(plugin.PluginNameOpenShiftConformance, filterID); err != nil { return err } return nil @@ -314,7 +317,7 @@ func (cs *ConsolidatedSummary) applyFilterFlaky() error { // applyFilterFlakeForPlugin query the Sippy API looking for each failed test // on each plugin/suite, saving the list on the ResultSummary. -func (cs *ConsolidatedSummary) applyFilterFlakeForPlugin(pluginName string) error { +func (cs *ConsolidatedSummary) applyFilterFlakeForPlugin(pluginName string, filterID string) error { var ps *plugin.OPCTPluginSummary switch pluginName { @@ -378,7 +381,7 @@ func (cs *ConsolidatedSummary) applyFilterFlakeForPlugin(pluginName string) erro } sort.Strings(ps.FailedFilter3) - log.Debugf("Filter3 (FlakeAPI) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", + log.Debugf("Filter (FlakeAPI) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", pluginName, len(ps.FailedFilter2), len(ps.FailedFilter3), len(ps.FailedExcludedFilter3)) return nil @@ -515,7 +518,7 @@ func (cs *ConsolidatedSummary) applyFilterBaselineAPIForPlugin(pluginName string // Filter5: Known Failures // applyFilterKnownFailures skip well known failures that are not relevant to the validation process. -func (cs *ConsolidatedSummary) applyFilterKnownFailures() error { +func (cs *ConsolidatedSummary) applyFilterKnownFailures(filterID string) error { // Reason to skip the test: // "[sig-arch] External binary usage" : // - The test is not relevant to the validation process, and it's not a real failure @@ -535,7 +538,7 @@ func (cs *ConsolidatedSummary) applyFilterKnownFailures() error { plugin.PluginNameOpenShiftConformance, plugin.PluginNameConformanceReplay, } { - if err := cs.applyFilterKnownFailuresForPlugin(pluginName); err != nil { + if err := cs.applyFilterKnownFailuresForPlugin(pluginName, filterID); err != nil { return fmt.Errorf("error while processing filter5 (baseline API): %w", err) } } @@ -543,7 +546,7 @@ func (cs *ConsolidatedSummary) applyFilterKnownFailures() error { } // Filter5 by plugin -func (cs *ConsolidatedSummary) applyFilterKnownFailuresForPlugin(pluginName string) error { +func (cs *ConsolidatedSummary) applyFilterKnownFailuresForPlugin(pluginName string, filterID string) error { var ps *plugin.OPCTPluginSummary // Get the list of the last filter in the pipeline @@ -565,7 +568,8 @@ func (cs *ConsolidatedSummary) applyFilterKnownFailuresForPlugin(pluginName stri } // read the failures from pipeline - e2eFailuresPipeline := ps.FailedFilter4 + filterFailures, filterFailuresExcluded := ps.GetFailuresByFilterID(filterID) + e2eFailuresPipeline := ps.GetPreviousFailuresByFilterID(filterID) hashExclusion := make(map[string]struct{}, len(cs.Provider.TestSuiteKnownFailures)) for _, v := range cs.Provider.TestSuiteKnownFailures { @@ -575,15 +579,16 @@ func (cs *ConsolidatedSummary) applyFilterKnownFailuresForPlugin(pluginName stri for _, v := range e2eFailuresPipeline { ps.Tests[v].State = "filter5KnownFailures" if _, ok := hashExclusion[v]; !ok { - ps.FailedFilter5 = append(ps.FailedFilter5, v) + filterFailures = 
append(filterFailures, v) continue } - ps.FailedExcludedFilter5 = append(ps.FailedExcludedFilter5, v) + filterFailuresExcluded = append(filterFailuresExcluded, v) } - sort.Strings(ps.FailedFilter5) + sort.Strings(filterFailures) + ps.SetFailuresByFilterID(filterID, filterFailures, filterFailuresExcluded) + log.Debugf("Filter5 (KF) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", - pluginName, len(ps.FailedFilter4), - len(ps.FailedFilter5), len(ps.FailedExcludedFilter5)) + pluginName, len(e2eFailuresPipeline), len(filterFailures), len(filterFailuresExcluded)) return nil } @@ -592,12 +597,12 @@ func (cs *ConsolidatedSummary) applyFilterKnownFailuresForPlugin(pluginName stri // candidate for flake or false-positive failure. // Replay step re-runs the failured tests from conformance suites in serial mode, // to check if the test is passing in a second shot. -func (cs *ConsolidatedSummary) applyFilterReplay() error { +func (cs *ConsolidatedSummary) applyFilterReplay(filterID string) error { for _, pluginName := range []string{ plugin.PluginNameKubernetesConformance, plugin.PluginNameOpenShiftConformance, } { - if err := cs.applyFilterReplayForPlugin(pluginName); err != nil { + if err := cs.applyFilterReplayForPlugin(pluginName, filterID); err != nil { return fmt.Errorf("error while processing filter5 (Replay): %w", err) } } @@ -608,7 +613,7 @@ func (cs *ConsolidatedSummary) applyFilterReplay() error { // applyFilterReplayForPlugin extracts passed tests from replay step, and check // if conformance plugins has intersection in its failures, if so the test is passing // in the second run, excluding it from the failures. -func (cs *ConsolidatedSummary) applyFilterReplayForPlugin(pluginName string) error { +func (cs *ConsolidatedSummary) applyFilterReplayForPlugin(pluginName string, filterID string) error { var ps *plugin.OPCTPluginSummary switch pluginName { case plugin.PluginNameKubernetesConformance: @@ -617,19 +622,24 @@ func (cs *ConsolidatedSummary) applyFilterReplayForPlugin(pluginName string) err case plugin.PluginNameOpenShiftConformance: ps = cs.GetProvider().GetOpenShift().GetResultOCPValidated() + case plugin.PluginNameOpenShiftUpgrade: + ps = cs.GetProvider().GetOpenShift().GetResultConformanceUpgrade() + default: return fmt.Errorf("plugin not found: %s", pluginName) } // read the failures from pipeline - e2eFailuresPipeline := ps.FailedFilter5 + filterFailures, filterFailuresExcluded := ps.GetFailuresByFilterID(filterID) + e2eFailuresPipeline := ps.GetPreviousFailuresByFilterID(filterID) replayPlugin := cs.GetProvider().GetOpenShift().GetResultConformanceReplay() if replayPlugin == nil { - log.Debugf("Filter6 (Replay) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", - pluginName, len(ps.FailedFilter5), - len(ps.FailedFilter6), len(ps.FailedExcludedFilter6)) - log.Debugf("skipping filter6 (Replay) for plugin: %s, no replay results", pluginName) + ps.SetFailuresByFilterID(filterID, filterFailures, filterFailuresExcluded) + log.Debugf("Filter (Replay) results: plugin=%s in=filter(%d) out=filter(%d) filterExcluded(%d)", + pluginName, len(e2eFailuresPipeline), + len(filterFailures), len(filterFailuresExcluded)) + log.Debugf("skipping filter (Replay) for plugin: %s, no replay results", pluginName) return nil } @@ -647,29 +657,30 @@ func (cs *ConsolidatedSummary) applyFilterReplayForPlugin(pluginName string) err for _, v := range e2eFailuresPipeline { ps.Tests[v].State = "filter6Replay" if _, ok := passedReplay[v]; !ok { - ps.FailedFilter6 = 
append(ps.FailedFilter6, v) + filterFailures = append(filterFailures, v) continue } - ps.FailedExcludedFilter6 = append(ps.FailedExcludedFilter6, v) + filterFailuresExcluded = append(filterFailuresExcluded, v) } - sort.Strings(ps.FailedFilter6) + sort.Strings(filterFailures) + ps.SetFailuresByFilterID(filterID, filterFailures, filterFailuresExcluded) log.Debugf("Filter6 (Replay) results: plugin=%s in=filter(%d) replay=pass(%d) fail(%d) out=filter(%d) filterExcluded(%d)", - pluginName, len(ps.FailedFilter5), len(passedReplay), len(failedReplay), - len(ps.FailedFilter6), len(ps.FailedExcludedFilter6)) + pluginName, len(e2eFailuresPipeline), len(passedReplay), len(failedReplay), + len(filterFailures), len(filterFailuresExcluded)) return nil } // Filter Final: // applyFilterCopyPipeline builds the final failures after filters for each plugin. -func (cs *ConsolidatedSummary) applyFilterCopyPipeline() error { +func (cs *ConsolidatedSummary) applyFilterCopyPipeline(filterID string) error { for _, pluginName := range []string{ plugin.PluginNameOpenShiftUpgrade, plugin.PluginNameKubernetesConformance, plugin.PluginNameOpenShiftConformance, plugin.PluginNameConformanceReplay, } { - if err := cs.applyFilterCopyPipelineForPlugin(pluginName); err != nil { + if err := cs.applyFilterCopyPipelineForPlugin(pluginName, filterID); err != nil { return fmt.Errorf("error while building filtered failures: %w", err) } } @@ -677,7 +688,7 @@ func (cs *ConsolidatedSummary) applyFilterCopyPipeline() error { } // applyFilterCopyPipelineForPlugin copy the last filter in the pipeline to the final result of failures. -func (cs *ConsolidatedSummary) applyFilterCopyPipelineForPlugin(pluginName string) error { +func (cs *ConsolidatedSummary) applyFilterCopyPipelineForPlugin(pluginName string, filterID string) error { var ps *plugin.OPCTPluginSummary // Get the list of the last filter in the pipeline @@ -685,17 +696,17 @@ func (cs *ConsolidatedSummary) applyFilterCopyPipelineForPlugin(pluginName strin case plugin.PluginNameKubernetesConformance: ps = cs.GetProvider().GetOpenShift().GetResultK8SValidated() // Should point to the last filter in the pipeline. - ps.FailedFiltered = ps.FailedFilter6 + ps.FailedFiltered = ps.GetPreviousFailuresByFilterID(filterID) case plugin.PluginNameOpenShiftConformance: ps = cs.GetProvider().GetOpenShift().GetResultOCPValidated() // Should point to the last filter in the pipeline. - ps.FailedFiltered = ps.FailedFilter6 + ps.FailedFiltered = ps.GetPreviousFailuresByFilterID(filterID) case plugin.PluginNameOpenShiftUpgrade: ps = cs.GetProvider().GetOpenShift().GetResultConformanceUpgrade() // Should point to the last filter in the pipeline. 
- ps.FailedFiltered = ps.FailedFilter5 + ps.FailedFiltered = ps.GetPreviousFailuresByFilterID(filterID) case plugin.PluginNameConformanceReplay: ps = cs.GetProvider().GetOpenShift().GetResultConformanceReplay() diff --git a/internal/opct/summary/openshift.go b/internal/opct/summary/openshift.go index e2ac77f3..ab23a460 100644 --- a/internal/opct/summary/openshift.go +++ b/internal/opct/summary/openshift.go @@ -59,16 +59,18 @@ type SummaryOpenShiftClusterNetworkV1 = configv1.Network type SummaryOpenShiftNetworkV1 = configv1.Network type Node struct { - Hostname string `json:"hostname,omitempty"` - Architecture string `json:"architecture,omitempty"` - OperatingSystem string `json:"os,omitempty"` - OperatingSystemId string `json:"osId,omitempty"` - CreationDate string `json:"creationDate,omitempty"` - NodeRoles string `json:"nodeRoles,omitempty"` - TaintsNodeRole string `json:"taints,omitempty"` - CapacityCPU string `json:"capacityCpu,omitempty"` - CapacityStorageGB string `json:"capacityStorageGB,omitempty"` - CapacityMemGB string `json:"capacityMemGB,omitempty"` + Hostname string `json:"hostname,omitempty"` + Architecture string `json:"architecture,omitempty"` + OperatingSystem string `json:"os,omitempty"` + OperatingSystemId string `json:"osId,omitempty"` + CreationDate string `json:"creationDate,omitempty"` + NodeRoles string `json:"nodeRoles,omitempty"` + TaintsNodeRole string `json:"taints,omitempty"` + CapacityCPU string `json:"capacityCpu,omitempty"` + CapacityStorageGB string `json:"capacityStorageGB,omitempty"` + CapacityMemGB string `json:"capacityMemGB,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + ControlPlane bool `json:"controlPlane,omitempty"` } func NewOpenShiftSummary() *OpenShiftSummary { @@ -241,6 +243,7 @@ func (os *OpenShiftSummary) SetNodes(nodes *v1.NodeList) error { CapacityStorageGB: sizeToHuman(node.Status.Capacity.StorageEphemeral().String()), CapacityMemGB: sizeToHuman(node.Status.Capacity.Memory().String()), CreationDate: node.GetObjectMeta().GetCreationTimestamp().String(), + Labels: make(map[string]string), } // parse labels for label, value := range node.GetObjectMeta().GetLabels() { @@ -257,9 +260,15 @@ func (os *OpenShiftSummary) SetNodes(nodes *v1.NodeList) error { case "node.openshift.io/os_id": customNode.OperatingSystemId = value continue + case "topology.kubernetes.io/zone": + customNode.Labels["topology.kubernetes.io/zone"] = value + continue } if strings.HasPrefix(label, "node-role.kubernetes.io") { if roleArr := strings.Split(label, "node-role.kubernetes.io/"); len(roleArr) == 2 { + if roleArr[1] == "master" || roleArr[1] == "control-plane" { + customNode.ControlPlane = true + } customNode.NodeRoles += fmt.Sprintf("%s ", roleArr[1]) continue } diff --git a/internal/opct/summary/result.go b/internal/opct/summary/result.go index ea215380..621cacbd 100644 --- a/internal/opct/summary/result.go +++ b/internal/opct/summary/result.go @@ -390,11 +390,11 @@ func (rs *ResultSummary) extractAndLoadData() error { if err := results.ExtractFileIntoStruct(pathResourceNsKubeConfigMap, path, info, &kubeSystemConfigMapList); err != nil { return errors.Wrap(err, fmt.Sprintf("extracting file '%s': %v", path, err)) } + if warn := results.ExtractBytes(pathMustGather, path, info, &mustGather); warn != nil { + log.Warnf("Unable to load file %s: %v\n", pathMustGather, warn) + return errors.Wrap(warn, fmt.Sprintf("extracting file '%s': %v", path, warn)) + } if saveToFlagEnabled { - if warn := results.ExtractBytes(pathMustGather, path, info, &mustGather); 
warn != nil { - log.Warnf("Unable to load file %s: %v\n", pathMustGather, warn) - return errors.Wrap(warn, fmt.Sprintf("extracting file '%s': %v", path, warn)) - } if warn := results.ExtractBytes(pathCAMIG, path, info, &CAMGI); warn != nil { log.Warnf("Unable to load file %s: %v\n", pathCAMIG, warn) return errors.Wrap(warn, fmt.Sprintf("extracting file '%s': %v", path, warn)) @@ -431,49 +431,53 @@ func (rs *ResultSummary) extractAndLoadData() error { return e }) if err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Extracting/result: %v", err) } log.Debugf("Processing results/Populating/Populating Summary/Processing") if err := rs.GetSonobuoy().SetCluster(&sbCluster); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Sonobuoy: %v", err) } if err := rs.GetOpenShift().SetInfrastructure(&ocpInfra); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Object/Infrastructure: %v", err) } if err := rs.GetOpenShift().SetClusterVersion(&ocpCV); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Object/Version: %v", err) } if err := rs.GetOpenShift().SetClusterOperators(&ocpCO); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Object/Operators: %v", err) } if err := rs.GetOpenShift().SetClusterNetwork(&ocpCN); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Object/Network: %v", err) } if err := rs.GetOpenShift().SetNodes(&nodes); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Object/Nodes: %v", err) } if err := rs.Suites.KubernetesConformance.Load(pathPluginArtifactTestsK8S, &testsSuiteK8S); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Plugin/kube: %v", err) } if err := rs.Suites.OpenshiftConformance.Load(pathPluginArtifactTestsOCP, &testsSuiteOCP); err != nil { - return err + log.Warnf("Processing results/Populating/Populating Summary/Processing/Plugin/openshift: %v", err) } rs.GetSonobuoy().SetPluginDefinition(plugin.PluginNameKubernetesConformance, &pluginDef10) rs.GetSonobuoy().SetPluginDefinition(plugin.PluginNameOpenShiftConformance, &pluginDef20) + rs.GetSonobuoy().ParseMetaRunlogs(&metaRunLogs) rs.GetSonobuoy().ParseMetaConfig(&metaConfig) rs.GetSonobuoy().ParseOpctConfigMap(&opctConfigMapList) + // TODO the must-gather parser is consuming more resource than expected, need to be + // reviewed, and parsers and queue handlers refactored. 
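+	// Must-gather is parsed in memory on every run; saveToFlagEnabled only
+	// controls whether the extracted artifacts are also persisted to disk.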
+ log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather") + rs.MustGather = mustgather.NewMustGather(fmt.Sprintf("%s/must-gather", rs.SavePath), saveToFlagEnabled) + if err := rs.MustGather.Process(&mustGather); err != nil { + log.Errorf("Processing results/Populating/Populating Summary/Processing/MustGather: %v", err) + } else { + log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather/CalculatingErrors") + rs.MustGather.AggregateCounters() + } + if saveToFlagEnabled { - log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather") - rs.MustGather = mustgather.NewMustGather(fmt.Sprintf("%s/must-gather", rs.SavePath)) - if err := rs.MustGather.Process(&mustGather); err != nil { - log.Errorf("Processing results/Populating/Populating Summary/Processing/MustGather: %v", err) - } else { - log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather/CalculatingErrors") - rs.MustGather.AggregateCounters() - } if len(CAMGI.Bytes()) > 0 { err = os.WriteFile(fmt.Sprintf("%s/%s", rs.SavePath, filepath.Base(pathCAMIG)), CAMGI.Bytes(), 0644) if err != nil { diff --git a/internal/openshift/mustgather/mustgather.go b/internal/openshift/mustgather/mustgather.go index 23754501..ed3415f9 100644 --- a/internal/openshift/mustgather/mustgather.go +++ b/internal/openshift/mustgather/mustgather.go @@ -3,6 +3,7 @@ package mustgather import ( "archive/tar" "bytes" + "fmt" "io" "os" "path/filepath" @@ -26,6 +27,7 @@ type rawFile struct { type MustGather struct { // path to the directory must-gather will be saved. path string + save bool // ErrorEtcdLogs summary of etcd errors parsed from must-gather. ErrorEtcdLogs *ErrorEtcdLogs `json:"ErrorEtcdLogs,omitempty"` @@ -45,22 +47,23 @@ type MustGather struct { PodNetworkChecks MustGatherPodNetworkChecks } -func NewMustGather(file string) *MustGather { +func NewMustGather(file string, save bool) *MustGather { return &MustGather{ path: file, + save: save, } } -// Process read the must-gather tarball. +// Process reads and process in memory the must-gather tarball file. func (mg *MustGather) Process(buf *bytes.Buffer) error { log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather/Reading") tar, err := getTarFromXZBuffer(buf) if err != nil { return err } + log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather/Processing") - err = mg.extract(tar) - if err != nil { + if err := mg.extract(tar); err != nil { return err } return nil @@ -124,6 +127,7 @@ func (mg *MustGather) AggregateCounters() { } } } + log.Debugf("Processing results/Populating/Populating Summary/Processing/MustGather/CalculatingErrors/CalculatingEtcdErrors") mg.calculateCountersEtcd() } @@ -162,13 +166,15 @@ func (mg *MustGather) calculateCountersEtcd() { mg.ErrorEtcdLogs.FilterRequestSlowAll = filterATTL2.GetStat(1) } -// extract read the tarball and extract the required information, scanning all files. +// extract reads, and process the tarball and extract the required information. func (mg *MustGather) extract(tarball *tar.Reader) error { - // Create must-gather directory under the result path. - if _, err := os.Stat(mg.path); err != nil { - if err := os.MkdirAll(mg.path, 0755); err != nil { - return err + // Creates directory only when needs it. 
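+	// The target directory is required only when extracted files will be
+	// written to disk (mg.save mirrors the report's saveToFlagEnabled option).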
+ if mg.save { + if _, err := os.Stat(mg.path); err != nil { + if err := os.MkdirAll(mg.path, 0755); err != nil { + return fmt.Errorf("error creating must-gather directory: %v", err) + } } } @@ -198,11 +204,13 @@ func (mg *MustGather) extract(tarball *tar.Reader) error { // the target location where the dir/file should be created. target := filepath.Join(mg.path, header.Name) - ok, itemType := matchToExtract(target) + + // check if the file should be processed. + ok, itemType := getFileTypeToProcess(target) if !ok { continue } - targetAlias := extractRelativePath(target) + targetAlias := normalizeRelativePath(target) // the following switch could also be done using fi.Mode(), not sure if there // a benefit of using one vs. the other. @@ -214,6 +222,7 @@ func (mg *MustGather) extract(tarball *tar.Reader) error { case tar.TypeDir: // creating subdirectories structures will be ignored and need // sub-directories under mg.path must be created previously if needed. + // Enable it only there is a use case to extract more data to disk preserving source dirs. /* targetDir := filepath.Join(mg.path, targetAlias) if _, err := os.Stat(targetDir); err != nil { @@ -230,13 +239,12 @@ func (mg *MustGather) extract(tarball *tar.Reader) error { // all the files in must-gather, extracting only information required by OPCT. switch itemType { case patternNamePodLogs: - // parallel processing the logs + // logs are processed in parallel, the buffer is released when processed. buf := bytes.Buffer{} if _, err := io.Copy(&buf, tarball); err != nil { - return err + log.Errorf("must-gather processor/podLogs: error copying buffer for %s: %v", targetAlias, err) + continue } - // waiterProcNS.Add(1) - // procQueueInc() processorBucket.Incremet() go func(filename string, buffer *bytes.Buffer) { processorBucket.AppendQueue(&MustGatherLog{ @@ -246,6 +254,11 @@ func (mg *MustGather) extract(tarball *tar.Reader) error { }(targetAlias, &buf) case patternNameEvents: + // skip extracting when save directory is not set. 
(in-memory processing only) + if !mg.save { + log.Debugf("skipping file %s", targetAlias) + continue + } // forcing file name for event filter targetLocal := filepath.Join(mg.path, "event-filter.html") f, err := os.OpenFile(targetLocal, os.O_CREATE|os.O_RDWR, os.FileMode(header.Mode)) @@ -308,8 +321,10 @@ func (mg *MustGather) processNamespaceErrors(mgLog *MustGatherLog) { mgLog.Pod = mgItems[2] mgLog.Container = mgItems[3] + // parse errors from logs mgLog.ErrorCounters = archive.NewErrorCounter(ptr.To(mgLog.buffer.String()), archive.CommonErrorPatterns) - // additional parsers + + // additional parsers: etcd error counter extractor if mgLog.Namespace == "openshift-etcd" && mgLog.Container == "etcd" && strings.HasSuffix(mgLog.Path, "current.log") { @@ -325,4 +340,7 @@ func (mg *MustGather) processNamespaceErrors(mgLog *MustGatherLog) { log.Errorf("one or more errors found when inserting errors: %v", err) } } + + // release buffer + mgLog.buffer.Reset() } diff --git a/internal/openshift/mustgather/podnetconcheck.go b/internal/openshift/mustgather/podnetconcheck.go index 92c33697..c374eabd 100644 --- a/internal/openshift/mustgather/podnetconcheck.go +++ b/internal/openshift/mustgather/podnetconcheck.go @@ -54,23 +54,24 @@ func (p *MustGatherPodNetworkChecks) Parse(data map[string]interface{}) { // TODO#1 use CRD PodNetworkConnectivityCheck and api controlplane.operator.openshift.io/v1alpha1 to parse // TODO#2 use reflection to read data + prefixErr := "must-gather extracting file pod_network_connectivity_check" for _, d := range data["items"].([]interface{}) { item := d.(map[interface{}]interface{}) if item["metadata"] == nil { - log.Errorf("unable to retrieve pod network check metadata: %v", item["metadata"]) + log.Debugf("%s/invalid metadata: %v", prefixErr, item["metadata"]) continue } metadata := item["metadata"].(map[interface{}]interface{}) if item["spec"] == nil { - log.Errorf("unable to retrieve pod network check spec: %v", item["spec"]) + log.Debugf("%s/invalid spec: %v", prefixErr, item["spec"]) continue } spec := item["spec"].(map[interface{}]interface{}) if item["status"] == nil { - log.Errorf("unable to retrieve pod network check status: %v", item["status"]) + log.Debugf("%s/invalid itme/status: %v", prefixErr, item) continue } status := item["status"].(map[interface{}]interface{}) @@ -131,5 +132,4 @@ func (p *MustGatherPodNetworkChecks) Parse(data map[string]interface{}) { } p.InsertCheck(check, netFailures, netOutages) } - } diff --git a/internal/openshift/mustgather/utils.go b/internal/openshift/mustgather/utils.go index 20a5ad45..52d56c16 100644 --- a/internal/openshift/mustgather/utils.go +++ b/internal/openshift/mustgather/utils.go @@ -36,10 +36,10 @@ var ( } ) -// matchToExtract define patterns to continue the must-gather processor. +// getFileTypeToProcess define patterns to continue the must-gather processor. // the pattern must be defined if the must be extracted. It will return // a boolean with match and the file group (pattern type). 
-func matchToExtract(path string) (bool, string) { +func getFileTypeToProcess(path string) (bool, string) { for typ, pattern := range mustGatherFilePatterns { re := regexp.MustCompile(pattern) if re.MatchString(path) { @@ -49,10 +49,10 @@ func matchToExtract(path string) (bool, string) { return false, "" } -// extractRelativePath removes the prefix of must-gather path/image to save the +// normalizeRelativePath removes the prefix of must-gather path/image to save the // relative file path when extracting the file or mapping in the counters. // OPCT collects must-gather automatically saving in the directory must-gather-opct. -func extractRelativePath(file string) string { +func normalizeRelativePath(file string) string { re := regexp.MustCompile(`must-gather-opct/([A-Za-z0-9]+(-[A-Za-z0-9]+)+\/)`) split := re.Split(file, -1) diff --git a/internal/report/baseline/baseline.go b/internal/report/baseline/baseline.go index 45d0e6ed..bc632931 100644 --- a/internal/report/baseline/baseline.go +++ b/internal/report/baseline/baseline.go @@ -80,7 +80,7 @@ func NewBaselineReportSummary() *BaselineConfig { // bucket exists. func (brs *BaselineConfig) createS3Clients() (*s3.S3, *s3manager.Uploader, error) { if !brs.checkRequiredParams() { - return nil, nil, fmt.Errorf("missing required parameters or dependencies to enable this feature. Please wait for stable release to use it") + return nil, nil, fmt.Errorf("missing required parameters or dependencies to enable this feature") } // create s3 client @@ -123,14 +123,13 @@ func (brs *BaselineConfig) ReadReportSummaryFromAPI(path string) ([]byte, error) retryClient := retryablehttp.NewClient() retryClient.RetryMax = 5 retryLogger := log.New() - retryLogger.SetLevel(log.DebugLevel) + retryLogger.SetLevel(log.WarnLevel) retryClient.Logger = retryLogger url := fmt.Sprintf("%s%s", reportBaseURL, path) req, err := http.NewRequest("GET", url, nil) if err != nil { - log.WithError(err).Error("error creating request") - return nil, err + return nil, fmt.Errorf("error creating request: %v", err) } req.Header.Set("X-Custom-Header", "opct") req.Header.Set("Content-Type", "application/json") @@ -138,8 +137,7 @@ func (brs *BaselineConfig) ReadReportSummaryFromAPI(path string) ([]byte, error) client := retryClient.StandardClient() resp, err := client.Do(req) if err != nil { - log.WithError(err).Error("error sending request") - return nil, err + return nil, fmt.Errorf("error sending request: %v", err) } defer resp.Body.Close() @@ -150,8 +148,7 @@ func (brs *BaselineConfig) ReadReportSummaryFromAPI(path string) ([]byte, error) rawResp, err := io.ReadAll(resp.Body) if err != nil { - log.WithError(err).Error("error reading response body") - return nil, err + return nil, fmt.Errorf("error reading response body: %v", err) } return rawResp, nil diff --git a/internal/report/data.go b/internal/report/data.go index 128a7202..b5aa61de 100644 --- a/internal/report/data.go +++ b/internal/report/data.go @@ -753,7 +753,6 @@ func (re *ReportData) SummaryBuilder() error { re.Provider.MustGatherInfo.NamespaceErrors = nil re.Provider.MustGatherInfo.PodNetworkChecks.Checks = nil } - // What else to clean up? 
return nil } diff --git a/internal/report/slo.go b/internal/report/slo.go index 9ee127e7..0a8e8876 100644 --- a/internal/report/slo.go +++ b/internal/report/slo.go @@ -124,7 +124,7 @@ func NewCheckSummary(re *ReportData) *CheckSummary { } // Cluster Checks checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-008", + ID: "OPCT-020", Name: "All nodes must be healthy", Test: func() CheckResult { res := CheckResult{Name: CheckResultNameFail, Target: "100%"} @@ -142,7 +142,7 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }, }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-009", + ID: "OPCT-021", Name: "Pods Healthy must report higher than 98%", Test: func() CheckResult { res := CheckResult{Name: CheckResultNameFail, Target: ">=98%"} @@ -210,7 +210,7 @@ func NewCheckSummary(re *ReportData) *CheckSummary { return res } perc := (float64(p.Stat.Failed) / float64(p.Stat.Total)) * 100 - res.Actual = fmt.Sprintf("Failed==%.2f%%", perc) + res.Actual = fmt.Sprintf("Fail==%.2f%%(%d)", perc, p.Stat.Failed) if perc > 1.5 { return res } @@ -220,12 +220,13 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }) checkSum.Checks = append(checkSum.Checks, &Check{ ID: "OPCT-005", - Name: "OpenShift Conformance [20-openshift-conformance-validated]: Priority must pass 99.95%", + Name: "OpenShift Conformance Validation [20]: Filter Priority Requirement >= 99.5%", Test: func() CheckResult { prefix := "Check OPCT-005 Failed" + target := 0.5 res := CheckResult{ Name: CheckResultNameFail, - Target: "Pass==100%(W<=0.5%,F>0.5%)", + Target: fmt.Sprintf("W<=%.2f%%,F>%.2f%%", target, target), Actual: "N/A", } if _, ok := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { @@ -242,42 +243,36 @@ func NewCheckSummary(re *ReportData) *CheckSummary { return res } perc := (float64(p.Stat.FilterFailedPrio) / float64(p.Stat.Total)) * 100 - res.Actual = fmt.Sprintf("FailedPrio==%.2f%%", perc) - if perc > 0.5 { + res.Actual = fmt.Sprintf("Fail==%.2f%%(%d)", perc, p.Stat.FilterFailedPrio) + if perc > target { res.Name = CheckResultNameFail return res } - if perc > 0 && perc <= 0.5 { - res.Name = CheckResultNameWarn - return res - } + // if perc > 0 && perc <= target { + // res.Name = CheckResultNameWarn + // return res + // } res.Name = CheckResultNamePass return res }, }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-TBD", - Name: "OpenShift Conformance [20-openshift-conformance-validated]: Pass 100% with Baseline", + ID: "OPCT-005B", + Name: "OpenShift Conformance Validation [20]: Required to Pass After Filtering", Test: func() CheckResult { - prefix := "Check OPCT-005 Failed" + prefix := "Check OPCT-005B Failed" + target := 0.55 res := CheckResult{ Name: CheckResultNameFail, - Target: "Pass==100%", + Target: fmt.Sprintf("Pass==100%%(W<=%.2f%%,F>%.2f%%)", target, target), Actual: "N/A", } if _, ok := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { return res } - if re.Baseline == nil { - res.Name = CheckResultNameSkip - return res - } - if _, ok := re.Baseline.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { - res.Name = CheckResultNameSkip - return res - } // "Acceptance" are relative, the baselines is observed to set // an "accepted" value considering a healthy cluster in known provider/installation. + // plugin := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance] p := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance] if p.Stat.Total == p.Stat.Failed { res.Message = "Potential Runtime Failure. Check the Plugin logs." 
@@ -285,20 +280,65 @@ func NewCheckSummary(re *ReportData) *CheckSummary { log.Debugf("%s Runtime: Total and Failed counters are equals indicating execution failure", prefix) return res } - perc := (float64(p.Stat.FilterFailedPrio) / float64(p.Stat.Total)) * 100 - res.Actual = fmt.Sprintf("FailedPrio==%.2f%%", perc) - if perc > 0 { + perc := (float64(p.Stat.FilterFailures) / float64(p.Stat.Total)) * 100 + res.Actual = fmt.Sprintf("Fail==%.2f%%(%d)", perc, p.Stat.FilterFailures) + if perc > target { res.Name = CheckResultNameFail return res } + if perc > 0 && perc <= target { + res.Name = CheckResultNameWarn + return res + } res.Name = CheckResultNamePass return res }, }) + // TODO: validate if this test is duplicated with OPCT-005 + // checkSum.Checks = append(checkSum.Checks, &Check{ + // ID: "OPCT-TBD", + // Name: "OpenShift Conformance [20-openshift-conformance-validated]: Pass 100% with Baseline", + // Test: func() CheckResult { + // prefix := "Check OPCT-TBD Failed" + // res := CheckResult{ + // Name: CheckResultNameFail, + // Target: "Pass==100%", + // Actual: "N/A", + // } + // if _, ok := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { + // return res + // } + // if re.Baseline == nil { + // res.Name = CheckResultNameSkip + // return res + // } + // if _, ok := re.Baseline.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { + // res.Name = CheckResultNameSkip + // return res + // } + // // "Acceptance" are relative, the baselines is observed to set + // // an "accepted" value considering a healthy cluster in known provider/installation. + // p := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance] + // if p.Stat.Total == p.Stat.Failed { + // res.Message = "Potential Runtime Failure. Check the Plugin logs." + // res.Actual = "Total==Failed" + // log.Debugf("%s Runtime: Total and Failed counters are equals indicating execution failure", prefix) + // return res + // } + // perc := (float64(p.Stat.FilterFailedPrio) / float64(p.Stat.Total)) * 100 + // res.Actual = fmt.Sprintf("FailedPrio==%.2f%%", perc) + // if perc > 0 { + // res.Name = CheckResultNameFail + // return res + // } + // res.Name = CheckResultNamePass + // return res + // }, + // }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-006", - Name: "Test suite must report a lower number of errors in logs", + ID: "OPCT-011", + Name: "The test suite should generate fewer error reports in the logs", Test: func() CheckResult { // threshold for warn and fail thWarn := 150 @@ -310,14 +350,14 @@ func NewCheckSummary(re *ReportData) *CheckSummary { } if re.Provider.ErrorCounters == nil { res.Name = CheckResultNameFail - res.Actual = "ERR missing data" + res.Actual = "ERR !counters" return res } cnt := *re.Provider.ErrorCounters if _, ok := cnt["total"]; !ok { res.Message = "Unable to load Total Counter" res.Name = CheckResultNameFail - res.Actual = "ERR missing data" + res.Actual = "ERR !total" return res } // "Acceptance" are relative, the baselines is observed to set @@ -344,8 +384,8 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }, }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-007", - Name: "Workloads must report a lower number of alerts in logs", + ID: "OPCT-010", + Name: "The cluster logs should generate fewer error reports in the logs", Test: func() CheckResult { wantLimit := 30000 res := CheckResult{Name: CheckResultNameWarn, Target: fmt.Sprintf("<=%d", wantLimit), Actual: "N/A"} @@ -353,13 +393,13 @@ func NewCheckSummary(re *ReportData) *CheckSummary { if 
re.Provider.MustGatherInfo == nil { log.Debugf("%s: MustGatherInfo is not defined", prefix) res.Name = CheckResultNameFail - res.Actual = "ERR missing must-gather" + res.Actual = "ERR !must-gather" return res } if _, ok := re.Provider.MustGatherInfo.ErrorCounters["total"]; !ok { log.Debugf("%s: OPCT-007: ErrorCounters[\"total\"]", prefix) res.Name = CheckResultNameFail - res.Actual = "ERR missing data" + res.Actual = "ERR !counters" return res } // "Acceptance" are relative, the baselines is observed to set @@ -374,7 +414,7 @@ func NewCheckSummary(re *ReportData) *CheckSummary { if total == 0 { log.Debugf("%s acceptance criteria: want[!=0] got[%d]", prefix, total) res.Name = CheckResultNameFail - res.Actual = "ERR missing data" + res.Actual = "ERR total==0" return res } res.Name = CheckResultNamePass @@ -431,29 +471,37 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }) */ checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-010", + ID: "OPCT-010A", Name: "etcd logs: slow requests: average should be under 500ms", Test: func() CheckResult { - prefix := "Check OPCT-010 Failed" + prefix := "Check OPCT-010A Failed" wantLimit := 500.0 - res := CheckResult{Name: CheckResultNameFail, Target: fmt.Sprintf("<=%.2f ms", wantLimit), Actual: "N/A"} + res := CheckResult{ + Name: CheckResultNameFail, + Target: fmt.Sprintf("<=%.2f ms", wantLimit), + Actual: "N/A", + } if re.Provider == nil { log.Debugf("%s: unable to read provider information.", prefix) return res } if re.Provider.MustGatherInfo == nil { + res.Actual = "ERR !must-gather" log.Debugf("%s: unable to read must-gather information.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs == nil { + res.Actual = "ERR !logs" log.Debugf("%s: unable to etcd stat from must-gather.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"] == nil { + res.Actual = "ERR !counters" log.Debugf("%s: unable to read statistics from parsed etcd logs.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"].StatMean == "" { + res.Actual = "ERR !p50" log.Debugf("%s: unable to get p50/mean statistics from parsed data: %v", prefix, re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"]) return res } @@ -477,35 +525,45 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }, }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-011", + ID: "OPCT-010B", Name: "etcd logs: slow requests: maximum should be under 1000ms", Test: func() CheckResult { - prefix := "Check OPCT-011 Failed" + prefix := "Check OPCT-010B Failed" wantLimit := 1000.0 - res := CheckResult{Name: CheckResultNameFail, Target: fmt.Sprintf("<=%.2f ms", wantLimit), Actual: "N/A"} + res := CheckResult{ + Name: CheckResultNameFail, + Target: fmt.Sprintf("<=%.2f ms", wantLimit), + Actual: "N/A", + } if re.Provider.MustGatherInfo == nil { + res.Actual = "ERR !must-gather" log.Debugf("%s: unable to read must-gather information.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs == nil { + res.Actual = "ERR !logs" log.Debugf("%s: unable to etcd stat from must-gather.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"] == nil { + res.Actual = "ERR !counters" log.Debugf("%s: unable to read statistics from parsed etcd logs.", prefix) return res } if re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"].StatMax == "" { + res.Actual = "ERR !max" log.Debugf("%s: unable to get max statistics from parsed data: %v", 
prefix, re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"]) return res } values := strings.Split(re.Provider.MustGatherInfo.ErrorEtcdLogs.FilterRequestSlowAll["all"].StatMax, " ") if values[0] == "" { + res.Actual = "ERR !max" log.Debugf("%s: unable to get parse max: %v", prefix, values) return res } value, err := strconv.ParseFloat(values[0], 64) if err != nil { + res.Actual = "ERR !max" log.Debugf("%s: unable to convert max to float: %v", prefix, err) return res } @@ -519,10 +577,10 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }, }) checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-012", + ID: "OPCT-022", Name: "Detected one or more plugin(s) with potential invalid result", Test: func() CheckResult { - prefix := "Check OPCT-012 Failed" + prefix := "Check OPCT-022 Failed" res := CheckResult{Name: CheckResultNameFail, Target: "passed", Actual: "N/A"} checkPlugins := []string{ @@ -553,6 +611,101 @@ func NewCheckSummary(re *ReportData) *CheckSummary { return res }, }) + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-023A", + // Should be greated than 300 + Name: "Sanity [10-openshift-kube-conformance]: potential missing tests in suite", + Test: func() CheckResult { + prefix := "Check OPCT-023A Failed" + res := CheckResult{ + Name: CheckResultNameFail, + Target: "F:<300", + Actual: "N/A", + } + if _, ok := re.Provider.Plugins[plugin.PluginNameKubernetesConformance]; !ok { + res.Actual = "ERR !plugin" + return res + } + p := re.Provider.Plugins[plugin.PluginNameKubernetesConformance] + res.Actual = fmt.Sprintf("Total==%d", p.Stat.Total) + if p.Stat.Total <= 300 { + log.Debugf("%s: found less than expected tests count=%d. Are you running in devel mode?", prefix, p.Stat.Total) + return res + } + res.Name = CheckResultNamePass + return res + }, + }) + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-023B", + // Should be greated than 3000 + Name: "Sanity [20-openshift-conformance-validated]: potential missing tests in suite", + Test: func() CheckResult { + prefix := "Check OPCT-023B Failed" + res := CheckResult{ + Name: CheckResultNameFail, + Target: "F:<3000", + Actual: "N/A", + } + if _, ok := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance]; !ok { + res.Actual = "ERR !plugin" + return res + } + p := re.Provider.Plugins[plugin.PluginNameOpenShiftConformance] + res.Actual = fmt.Sprintf("Total==%d", p.Stat.Total) + if p.Stat.Total <= 3000 { + log.Debugf("%s: found less than expected tests count=%d. Is it running in devel mode?!", prefix, p.Stat.Total) + return res + } + res.Name = CheckResultNamePass + return res + }, + }) + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-030", + Name: "Node Topology: ControlPlaneTopology HighlyAvailable must use multi-zone", + Test: func() CheckResult { + prefix := "Check OPCT-030 Failed" + res := CheckResult{ + Name: CheckResultNameFail, + Target: "W:>1,P:>2", + Actual: "N/A", + } + if re.Provider.Infra == nil { + log.Debugf("%s: missing Infrastructure object to discover ControlPlaneTopology", prefix) + res.Actual = "ERR !infra" + return res + } + if re.Provider.Infra.ControlPlaneTopology != "HighlyAvailable" { + res.Name = CheckResultNameSkip + res.Actual = fmt.Sprintf("Topology==%s", re.Provider.Infra.ControlPlaneTopology) + return res + } + // Why having 2 or less nodes in HighlyAvailable? 
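+			// A cluster reporting ControlPlaneTopology=HighlyAvailable is
+			// expected to report at least three nodes.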
+ if len(re.Provider.Nodes) < 3 { + log.Debugf("%s: two or less control plane nodes", prefix) + res.Actual = fmt.Sprintf("Nodes==%d", len(re.Provider.Nodes)) + return res + } + controlPlaneZones := map[string]struct{}{} + for _, node := range re.Provider.Nodes { + if !node.ControlPlane { + continue + } + if zone, ok := node.Labels["topology.kubernetes.io/zone"]; ok { + controlPlaneZones[zone] = struct{}{} + } + } + if len(controlPlaneZones) < 2 { + log.Debugf("%s: found one zone: %v", prefix, controlPlaneZones) + res.Actual = fmt.Sprintf("Zones==%d", len(controlPlaneZones)) + return res + } + res.Name = CheckResultNamePass + res.Actual = fmt.Sprintf("Zones==%d", len(controlPlaneZones)) + return res + }, + }) // OpenShift / Infrastructure Object Check checkSum.Checks = append(checkSum.Checks, &Check{ ID: CheckIdEmptyValue, @@ -677,28 +830,32 @@ func NewCheckSummary(re *ReportData) *CheckSummary { }, }) // TODO(network): podConnectivityChecks must not have outages - // TODO(topology): check if cluster is multi-zone - checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-TBD", - Name: "Node Topology: ControlPlaneTopology HighlyAvailable must use multi-zone", - Test: func() CheckResult { - return CheckResult{Name: CheckResultNameSkip, Target: "TBD", Actual: "TODO"} - }, - }) - checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-TBD", - Name: "Failed tests must pass in replay step for plugin: 10-openshift-kube-conformance", - Test: func() CheckResult { - return CheckResult{Name: CheckResultNameSkip, Target: "TBD", Actual: "TODO"} - }, - }) - checkSum.Checks = append(checkSum.Checks, &Check{ - ID: "OPCT-TBD", - Name: "Failed tests must pass in replay step for plugin20-openshift-conformance-validated", - Test: func() CheckResult { - return CheckResult{Name: CheckResultNameSkip, Target: "TBD", Actual: "TODO"} - }, - }) + + // TODO: + // Question#1: Do we need this test considering there is a check of passing=100% on kube conformance? + // Question#2: is that check really need considering the final filters target 0 failures? + // checkSum.Checks = append(checkSum.Checks, &Check{ + // ID: "OPCT-TBD", + // Name: "Kubernetes Conformance [10-openshift-kube-conformance]: replay failures must-pass", + // Description: "Tests that failed in the previous run must pass in the replay step (re-run)", + // Test: func() CheckResult { + // return CheckResult{Name: CheckResultNameSkip, Target: "TBD", Actual: "TODO"} + // }, + // }) + // checkSum.Checks = append(checkSum.Checks, &Check{ + // ID: "OPCT-TBD", + // Name: "OpenShift Conformance [20-openshift-conformance-validated]: replay failures must-pass", + // Description: "Tests that failed in the previous run must pass in the replay step (re-run)", + // Test: func() CheckResult { + // // for each failed test in the Filter5, check if it passed in the replay. 
+ // // return CheckResult{Name: CheckResultNameSkip, Target: "TBD", Actual: "TODO"} + // res := CheckResult{ + // Name: CheckResultNameFail, + // Target: "F:<300", + // Actual: "N/A", + // } + // }, + // }) // Create docs reference when ID is set for c := range checkSum.Checks { diff --git a/pkg/cmd/adm/baseline/publish.go b/pkg/cmd/adm/baseline/publish.go index b4ca6dd0..c26ba3b8 100644 --- a/pkg/cmd/adm/baseline/publish.go +++ b/pkg/cmd/adm/baseline/publish.go @@ -110,11 +110,12 @@ func baselinePublishCmdRun(cmd *cobra.Command, args []string) { // OPCT-007 (ERR missing must-gather): must-gather is missing rejected := false for _, check := range re.Checks.Fail { - if check.ID == "OPCT-012" || + if check.ID == "OPCT-001" || check.ID == "OPCT-004" || - check.ID == "OPCT-003" || - check.ID == "OPCT-006" || - check.ID == "OPCT-007" { + check.ID == "OPCT-005" || + check.ID == "OPCT-022" || + check.ID == "OPCT-023A" || + check.ID == "OPCT-023B" { errMessage := fmt.Sprintf("%q: want=%q, got=%q", check.SLO, check.SLITarget, check.SLIActual) if check.Message != "" { errMessage = fmt.Sprintf("%s: message=%q", errMessage, check.Message) diff --git a/pkg/cmd/report/report.go b/pkg/cmd/report/report.go index b5df0043..52b134f8 100644 --- a/pkg/cmd/report/report.go +++ b/pkg/cmd/report/report.go @@ -5,6 +5,7 @@ import ( "net/http" "os" "path/filepath" + "sort" "github.com/pkg/errors" "github.com/spf13/cobra" @@ -490,7 +491,9 @@ func showProcessedSummary(re *report.ReportData) error { fmt.Printf("\n=> Processed Summary <=\n") fmt.Printf("==> Result Summary by test suite:\n") bProcessed := re.Provider.HasValidBaseline - for _, pluginName := range re.Provider.GetPlugins() { + plugins := re.Provider.GetPlugins() + sort.Strings(plugins) + for _, pluginName := range plugins { showSummaryPlugin(re.Provider, pluginName, bProcessed) } return nil @@ -514,7 +517,7 @@ func showSummaryPlugin(re *report.ReportResult, pluginName string, bProcessed bo titleIcon := "" tb.SetColumnConfigs([]table.ColumnConfig{ {Number: 1, WidthMin: 25, WidthMax: 25}, - {Number: 2, WidthMin: 10, WidthMax: 10}, + {Number: 2, WidthMin: 13, WidthMax: 13}, }) rows := []table.Row{} @@ -538,12 +541,12 @@ func showSummaryPlugin(re *report.ReportResult, pluginName string, bProcessed bo renderTable() return } - rows = append(rows, table.Row{"Filter1 Failed Suite", plugin.UtilsCalcPercStr(stat.FilterSuite, stat.Total)}) - rows = append(rows, table.Row{"Filter2 Failed Baseline", plugin.UtilsCalcPercStr(stat.FilterBaseline, stat.Total)}) - rows = append(rows, table.Row{"Filter3 Failed Priority", plugin.UtilsCalcPercStr(stat.FilterFailedPrio, stat.Total)}) - rows = append(rows, table.Row{"Filter4 Failed API", plugin.UtilsCalcPercStr(stat.FilterFailedAPI, stat.Total)}) - rows = append(rows, table.Row{"Filter5 Failed KF", plugin.UtilsCalcPercStr(stat.Filter5Failures, stat.Total)}) - rows = append(rows, table.Row{"Filter6 Replay", plugin.UtilsCalcPercStr(stat.Filter6Failures, stat.Total)}) + rows = append(rows, table.Row{"Filter Failed Suite", plugin.UtilsCalcPercStr(stat.FilterSuite, stat.Total)}) + rows = append(rows, table.Row{"Filter Failed KF", plugin.UtilsCalcPercStr(stat.Filter5Failures, stat.Total)}) + rows = append(rows, table.Row{"Filter Replay", plugin.UtilsCalcPercStr(stat.Filter6Failures, stat.Total)}) + rows = append(rows, table.Row{"Filter Failed Baseline", plugin.UtilsCalcPercStr(stat.FilterBaseline, stat.Total)}) + rows = append(rows, table.Row{"Filter Failed Priority", plugin.UtilsCalcPercStr(stat.FilterFailedPrio, stat.Total)}) + 
rows = append(rows, table.Row{"Filter Failed API", plugin.UtilsCalcPercStr(stat.FilterFailedAPI, stat.Total)}) rows = append(rows, table.Row{"Failures (Priotity)", plugin.UtilsCalcPercStr(stat.FilterFailures, stat.Total)}) // TODO(mtulio): review suites provides better signal. diff --git a/pkg/run/manifests.go b/pkg/run/manifests.go index 34bf6b4b..6162051e 100644 --- a/pkg/run/manifests.go +++ b/pkg/run/manifests.go @@ -36,18 +36,20 @@ func loadPluginManifests(r *RunOptions) ([]*manifest.Manifest, error) { return nil, err } for _, m := range pluginManifests { - log.Debugf("Loading certification plugin: %s", m) + log.Debugf("Loading plugin: %s", m) pluginManifestTpl, err := efs.GetData().ReadFile(m) if err != nil { - log.Errorf("Unable to read plugin manifest %s", m) + log.Errorf("error reading config for plugin %s: %v", m, err) return nil, err } pluginManifest, err := ProcessManifestTemplates(r, pluginManifestTpl) if err != nil { + log.Errorf("error processing configuration for plugin %s: %v", m, err) return nil, err } asset, err := loader.LoadDefinition(pluginManifest) if err != nil { + log.Errorf("error loading configuration for plugin %s: %v", m, err) return nil, err } manifests = append(manifests, &asset) diff --git a/pkg/run/run.go b/pkg/run/run.go index eda5a85b..055abeac 100644 --- a/pkg/run/run.go +++ b/pkg/run/run.go @@ -37,9 +37,8 @@ import ( type RunOptions struct { plugins *[]string - sonobuoyImage string - imageRepository string - openshiftTestsImage string + sonobuoyImage string + imageRepository string // PluginsImage // defines the image containing plugins associated with the provider-certification-tool. @@ -47,6 +46,7 @@ type RunOptions struct { PluginsImage string CollectorImage string MustGatherMonitoringImage string + OpenshiftTestsImage string timeout int watch bool @@ -114,16 +114,15 @@ func NewCmdRun() *cobra.Command { log.Info("Running OPCT...") // Fire off sonobuoy - err := o.Run(kclient, sclient) - if err != nil { - log.WithError(err).Fatal("Error running the tool. Please check the errors and try again.") + if err := o.Run(kclient, sclient); err != nil { + // log.WithError(err).Fatal("Error running the tool. Please check the errors and try again.") + log.Fatal("unable to run the tool, review the errors and try again: %w", err) } log.Info("Jobs scheduled! 
Waiting for resources be created...") // Wait for Sonobuoy to create - err = wait.WaitForRequiredResources(kclient) - if err != nil { + if err = wait.WaitForRequiredResources(kclient); err != nil { log.WithError(err).Fatal("error waiting for sonobuoy pods to become ready") } @@ -173,12 +172,12 @@ func NewCmdRun() *cobra.Command { // devel can be override by quay.io/opct/openshift-tests:devel // opct run --devel-skip-checks=true --plugins-image=plugin-openshift-tests:v0.0.0-devel-8ff93d9 --devel-tests-image=quay.io/opct/openshift-tests:devel - cmd.Flags().StringVar(&o.openshiftTestsImage, "openshift-tests-image", pkg.OpenShiftTestsImage, "Developer Mode only: openshift-tests image override") + cmd.Flags().StringVar(&o.OpenshiftTestsImage, "openshift-tests-image", pkg.OpenShiftTestsImage, "Developer Mode only: openshift-tests image override") // Hide optional flags hideOptionalFlags(cmd, "dedicated") - hideOptionalFlags(cmd, "devel-limit-tests") - hideOptionalFlags(cmd, "devel-skip-checks") + // hideOptionalFlags(cmd, "devel-limit-tests") + // hideOptionalFlags(cmd, "devel-skip-checks") hideOptionalFlags(cmd, "sonobuoy-image") hideOptionalFlags(cmd, "plugins-image") @@ -554,12 +553,11 @@ func (r *RunOptions) Run(kclient kubernetes.Interface, sclient sonobuoyclient.In } if r.plugins == nil || len(*r.plugins) == 0 { - // Use default built-in plugins log.Debugf("Loading default plugins") var err error manifests, err = loadPluginManifests(r) if err != nil { - return nil + return err } } else { // User provided their own plugins at command line diff --git a/pkg/types.go b/pkg/types.go index dbcdab45..7b208884 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -19,10 +19,10 @@ const ( SonobuoyLabelComponentName = "component" SonobuoyLabelComponentValue = "sonobuoy" DefaultToolsRepository = "quay.io/opct" - PluginsImage = "plugin-openshift-tests:v0.5.0-alpha.5" - CollectorImage = "plugin-artifacts-collector:v0.5.0-alpha.5" + PluginsImage = "plugin-openshift-tests:v0.0.0-devel-d4745f8" + CollectorImage = "plugin-artifacts-collector:v0.0.0-devel-d4745f8" + MustGatherMonitoringImage = "must-gather-monitoring:v0.0.0-devel-d4745f8" OpenShiftTestsImage = "image-registry.openshift-image-registry.svc:5000/openshift/tests" - MustGatherMonitoringImage = "must-gather-monitoring:v0.5.0-alpha.4" ) var ( @@ -43,17 +43,17 @@ var ( ) func GetSonobuoyImage() string { - return DefaultToolsRepository + SonobuoyImage + return fmt.Sprintf("%s/%s", DefaultToolsRepository, SonobuoyImage) } func GetPluginsImage() string { - return DefaultToolsRepository + PluginsImage + return fmt.Sprintf("%s/%s", DefaultToolsRepository, PluginsImage) } func GetCollectorImage() string { - return DefaultToolsRepository + CollectorImage + return fmt.Sprintf("%s/%s", DefaultToolsRepository, CollectorImage) } func GetMustGatherMonitoring() string { - return DefaultToolsRepository + MustGatherMonitoringImage + return fmt.Sprintf("%s/%s", DefaultToolsRepository, MustGatherMonitoringImage) }
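
The `applyFilter*ForPlugin` functions above now share one pattern: read the previous step's failures, split them into kept and excluded lists, and store both under the step's filter name. The sketch below restates that flow using the accessors added to `OPCTPluginSummary` in this patch; the helper name, the `exclude` set, and the import path are assumptions for illustration, not part of the change.

```go
package example

import (
	"sort"

	// Module path assumed from the repository referenced in this patch.
	"github.com/redhat-openshift-ecosystem/provider-certification-tool/internal/opct/plugin"
)

// applyGenericFilter is a hypothetical helper showing the flow shared by the
// filter steps in ConsolidatedSummary.Process after this change.
func applyGenericFilter(ps *plugin.OPCTPluginSummary, filterID string, exclude map[string]struct{}) {
	// Input is the output of the previous step in the pipeline
	// (e.g. suite-only -> known-failures -> replay -> baseline).
	input := ps.GetPreviousFailuresByFilterID(filterID)

	kept, excluded := ps.GetFailuresByFilterID(filterID)
	for _, name := range input {
		if _, ok := exclude[name]; ok {
			excluded = append(excluded, name)
			continue
		}
		kept = append(kept, name)
	}
	sort.Strings(kept)

	// Persist the result under the same filter name so the next step can
	// read it through GetPreviousFailuresByFilterID.
	ps.SetFailuresByFilterID(filterID, kept, excluded)
}
```

Keying each step by a filter name lets `Process()` reorder the pipeline (known-failures and replay now run before the baseline filter) without each step hard-coding which `FailedFilterN` fields it reads and writes.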