diff --git a/data/templates/report/report.html b/data/templates/report/report.html index 0b9b25a6..9500faef 100644 --- a/data/templates/report/report.html +++ b/data/templates/report/report.html @@ -530,18 +530,35 @@

Summary

this.menuBody = "> " + this.pageHeadline this.menuBody += "

Presubmit checks extracted from OPCT results." + // build hyperlinks in the name (when exists) + dtFailures = [] + for (let check of this.report.checks.failures) { + if (check.reference != "" && check.patched == undefined) { + check.patched = true + check.name = ""+ check.name +"
" + } + dtFailures.push(check) + } let tbFailures = { - header: "Check Failures (must be fixed) ("+ this.report.checks.failures.length +")", - data: this.report.checks.failures, + header: "Failed Checks [must be fixed] ("+ dtFailures.length +")", + data: dtFailures, headline: "", fields: fields=["name","result"], fieldMap: {} } this.menuBody += this.createTableHTML(table=tbFailures); + dtSuccess = [] + for (let check of this.report.checks.successes) { + if (check.reference != "" && check.patched == undefined) { + check.patched = true + check.name = ""+ check.name +"
" + } + dtSuccess.push(check) + } let tSucc = { - header: "Passed checks ("+ this.report.checks.successes.length +")", - data: this.report.checks.successes, + header: "Passed checks ("+ dtSuccess.length +")", + data: dtSuccess, headline: "", fields: fields=["name","result"], fieldMap: {} diff --git a/docs/README.md b/docs/README.md index f8891d97..a60ce948 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,6 +9,6 @@ Here you can find the initial steps to use the tool: - [User Guide](./user.md) - [Installation Check List](./user-installation-checklist.md) - [Installation Review](./user-installation-review.md) - - [Results Review](./user-results-review.md) + - [Support Guide](./support-guide.md) - [Development Guide](./dev.md) \ No newline at end of file diff --git a/docs/review/index.md b/docs/review/index.md new file mode 100644 index 00000000..dff05559 --- /dev/null +++ b/docs/review/index.md @@ -0,0 +1,6 @@ +# Review Guides + +OPCT provides a set of documents and guides to explore the results, +and common issues in OpenShift clusters. + +- [OPCT Review Rules](./rules.md): Acceptance rules used by `OPCT` in the `report` command when processing the results. \ No newline at end of file diff --git a/docs/review/rules.md b/docs/review/rules.md new file mode 100644 index 00000000..cdbb95b1 --- /dev/null +++ b/docs/review/rules.md @@ -0,0 +1,263 @@ +# OPCT Review/Check Rules + +The OPCT rules are used in the `report` command to evaluate the data collected by +the OPCT execution. The HTML report will link directly to the rule ID on this page. + +The rule details can be used as an additional resource in the review process. + +The acceptance criteria for the rules are based on the CI results. 
+ +## Rules +___ +### OPCT-001 + +- **Name**: Plugin Conformance Kubernetes [10-openshift-kube-conformance] must pass (after filters) +- **Description**: Kubernetes Conformance suite (defined as `kubernetes/conformance` in `openshift-tests`) implements the e2e tests required by Kubernetes Certification. + +Expected: +``` + - 10-openshift-kube-conformance: +... + - Failed (Filter SuiteOnly): 0 (0.00%) + - Failed (Priority) : 0 (0.00%) + - Status After Filters : passed +``` + +- **Troubleshooting**: + +Review the High-Priority Failures: +```sh +$ /opct-dev report archive.tar.gz +(..) + => 10-openshift-kube-conformance: (2 failures, 0 flakes) + + --> Failed tests to Review (without flakes) - Immediate action: +[total=2] [sig-apps=1 (50.00%)] [sig-api-machinery=1 (50.00%)] + +15 [sig-apps] Deployment deployment should support proportional scaling [Conformance] [Suite:openshift/conformance/parallel/minimal] [Suite:k8s] +6 [sig-api-machinery] Aggregator Should be able to support the 1.17 Sample API Server using the current Aggregator [Conformance] [Suite:openshift/conformance/parallel/minimal] [Suite:k8s] + + --> Failed flake tests - Statistic from OpenShift CI +[total=0] + +Flakes Perc ErrCount TestName + +``` +___ +### OPCT-002 + +- **Name**: Plugin Conformance Upgrade [05-openshift-cluster-upgrade] must pass +- **Description**: The upgrade conformance suite runs e2e tests while upgrading the cluster using the `openshift-tests` tool. The overall result must be `passed`. +___ +### OPCT-003 + +- **Name**: Plugin Collector [99-openshift-artifacts-collector] must pass. +- **Description**: The Collector plugin is responsible for retrieving information from the cluster, including must-gather, etcd parsed logs, and e2e test lists for conformance suites. The state is expected to be `passed`; otherwise, the review flow will be impacted. 
+- **Troubleshooting**: + +Check the failed tests: +```sh +$ ./opct results -p 99-openshift-artifacts-collector archive.tar.gz +``` + +Check the plugin logs: +```sh +$ grep -B 5 'Creating failed JUnit' \ + podlogs/openshift-provider-certification/sonobuoy-99-*/logs/plugin.txt +``` +___ +### OPCT-004 + +- **Name**: OpenShift Conformance [20-openshift-conformance-validated]: Failed tests must report less than 1.5% +- **Description**: OpenShift Conformance suite must not report a high number of failures in the base execution. Ideally, the lower the better, but the e2e tests are frequently updated/improved to fix bugs, and the tested release could eventually be impacted by those issues. The 1.5% reference baseline comes from executions on known platforms. A higher failure rate could be related to errors in the tested environment. Check the test logs to isolate the issues. +- **Action**: Check the failures section `Test failures [high priority]` + +___ +### OPCT-005 +- **Name**: OpenShift Conformance [20-openshift-conformance-validated]: Priority must report less than 0.5% +- **Description**: OpenShift Conformance suite must not report a high number of failures after applying filters. Ideally, the lower the better, but the e2e tests are frequently updated/improved to fix bugs, and the tested release could eventually be impacted by those issues. The 0.5% reference baseline comes from executions on known platforms. A higher failure rate could be related to errors in the tested environment. Check the test logs to isolate the issues. +- **Action**: Check the failures section `Test failures [high priority]` + +___ +### OPCT-006 +- **Name**: Suite Errors must report a lower number of log errors +- **Description**: The Conformance suites are reporting a high number of errors. +- **Action**: Check test logs to isolate the errors. 
+- **Troubleshooting**: + +To check the error counter by e2e test using the HTML report, navigate to `Suite Errors` in the left menu and open the table `Tests by Error Pattern`. + +To check the logs, navigate to the Plugin menu and check the logs `failure` and `systemOut`. + +___ +### OPCT-007 +- **Name**: Workloads must report a lower number of errors in the logs +- **Description**: Workloads collected are reporting a high number of errors. +- **Action**: Check pod logs to isolate the issue. +- **Troubleshooting**: + +To check the error counter by test using the HTML report, navigate to `Workload Errors` in the left menu. The table `Error Counters by Namespace` shows the namespaces reporting a high number of errors, ranked from highest to lowest; you can start exploring the logs in the top-ranked namespace. + +The table `Error Counters by Pod and Pattern` in the `Workload Errors` menu also reports the pods; +you can also use that information to isolate any issue in your environment. + +To explore the logs, you can extract the must-gather collected by the plugin `99-openshift-artifacts-collector`: + +```sh +# extract must-gather from the results +tar xfz artifact.tar.gz \ + plugins/99-openshift-artifacts-collector/results/global/artifacts_must-gather.tar.xz + +# extract must-gather +mkdir must-gather && \ + tar xfJ plugins/99-openshift-artifacts-collector/results/global/artifacts_must-gather.tar.xz \ + -C must-gather + +# check workload logs with omc (example etcd) +omc use must-gather +omc logs -n openshift-etcd etcd-control-plane-0 -c etcd +``` +___ +### OPCT-008 +- **Name**: All nodes must be healthy +- **Description**: All nodes in the cluster must be ready. +- **Action**: Check the nodes and the reason they are not reporting as ready. 
+- **Troubleshooting**: + +Check the unhealthy nodes in the cluster: +```sh +$ omc get nodes +``` + +Review the node and events: +```sh +$ omc describe node +``` + +___ +### OPCT-009 +- **Name**: Pods Healthy must report higher than 98% +- **Description**: Pods must report healthy. +- **Action**: Check the failing pod, and isolate if it is related to the environment and/or the validation tests. +- **Troubleshooting**: + +Check the unhealthy pods: +```sh +$ ./opct report archive.tar.gz +(...) + Health summary: [A=True/P=True/D=True] + - Cluster Operators : [33/0/0] + - Node health : 6/6 (100.00%) + - Pods health : 246/247 (99.00%) + + Failed pods: + Namespace/PodName Healthy Ready Reason Message + openshift-kube-controller-manager/installer-6-control-plane-1 false False PodFailed +(...) +``` + +Explore the pods: +```sh +$ omc get pods -A |egrep -v '(Running|Completed)' +``` +___ + +___ +### OPCT-007 +- **Name**: Workloads must report a lower number of errors in the logs +- **Description**: Workloads collected are reporting a high number of errors. +- **Action**: Check pod logs to isolate the issue. +- **Troubleshooting**: + +To check the error counter by e2e test using the HTML report, navigate to `Workload Errors` in the left menu. The table `Error Counters by Namespace` shows the namespaces reporting a high number of errors, ranked from highest to lowest; you can start exploring the logs in the top-ranked namespace. + +The table `Error Counters by Pod and Pattern` in the `Workload Errors` menu also reports the pods; +you can also use that information to isolate any issue in your environment. 
+ +To explore the logs, you can extract the must-gather collected by the plugin `99-openshift-artifacts-collector`: + +```sh +# extract must-gather from the results +tar xfz artifact.tar.gz \ + plugins/99-openshift-artifacts-collector/results/global/artifacts_must-gather.tar.xz + +# extract must-gather +mkdir must-gather && \ + tar xfJ plugins/99-openshift-artifacts-collector/results/global/artifacts_must-gather.tar.xz \ + -C must-gather + +# check workload logs with `omc` (example etcd) +omc use must-gather +omc logs -n openshift-etcd etcd-control-plane-0 -c etcd +``` +___ +### OPCT-008 +- **Name**: All nodes must be healthy +- **Description**: All nodes in the cluster must be ready. +- **Action**: Check the nodes and the reason it is not reporting as ready. +- **Troubleshooting**: + +Check the unhealthy nodes in the cluster: +```sh +$ omc get nodes +``` + +Review the node and events: +```sh +$ omc describe node +``` + +___ +### OPCT-009 +- **Name**: Pods Healthy must report higher than 98% +- **Description**: Pods must report healthy. +- **Action**: Check the failing pod, and isolate if it is related to the environment and/or the validation tests. +- **Troubleshooting**: + +Check the unhealthy pods: +```sh +$ ./opct report archive.tar.gz +(...) + Health summary: [A=True/P=True/D=True] + - Cluster Operators : [33/0/0] + - Node health : 6/6 (100.00%) + - Pods health : 246/247 (99.00%) + + Failed pods: + Namespace/PodName Healthy Ready Reason Message + openshift-kube-controller-manager/installer-6-control-plane-1 false False PodFailed +(...) 
+``` + +Explore the pods: +```sh +$ omc get pods -A |egrep -v '(Running|Completed)' +``` +___ + \ No newline at end of file diff --git a/internal/pkg/summary/checks.go b/internal/pkg/summary/checks.go index ab9dfa5d..a2bf80b6 100644 --- a/internal/pkg/summary/checks.go +++ b/internal/pkg/summary/checks.go @@ -10,18 +10,36 @@ Existing Checks: */ package summary -import "fmt" +import ( + "fmt" + "os" +) + +const ( + docsRulesPath = "/review/rules" + defaultBaseURL = "https://redhat-openshift-ecosystem.github.io/provider-certification-tool" +) type CheckSummary struct { - Checks []*Check `json:"checks"` + baseURL string + Checks []*Check `json:"checks"` } func NewCheckSummary(re *Report) *CheckSummary { - checks := &CheckSummary{ - Checks: []*Check{}, + + baseURL := defaultBaseURL + // mkdocs serve + // export OPCT_DEV_BASE_URL_DOC="http://127.0.0.1:8000/provider-certification-tool" + localDevBaseURL := os.Getenv("OPCT_DEV_BASE_URL_DOC") + if localDevBaseURL != "" { + baseURL = localDevBaseURL + } + checkSum := &CheckSummary{ + Checks: []*Check{}, + baseURL: fmt.Sprintf("%s%s", baseURL, docsRulesPath), } // OpenShift / Infrastructure Object Check - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Platform Type should be None", Test: func() CheckResult { if re.Provider == nil || re.Provider.Infra == nil { @@ -35,7 +53,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Cluster Version Operator must be Available", Test: func() CheckResult { if re.Provider == nil || re.Provider.Version == nil || re.Provider.Version.OpenShift == nil { @@ -47,7 +65,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Cluster condition Failing must be False", Test: 
func() CheckResult { if re.Provider == nil || re.Provider.Version == nil || re.Provider.Version.OpenShift == nil { @@ -59,7 +77,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Cluster upgrade must not be Progressing", Test: func() CheckResult { if re.Provider == nil || re.Provider.Version == nil || re.Provider.Version.OpenShift == nil { @@ -71,7 +89,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Cluster ReleaseAccepted must be True", Test: func() CheckResult { if re.Provider == nil || re.Provider.Version == nil || re.Provider.Version.OpenShift == nil { @@ -83,7 +101,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Infrastructure status must have Topology=HighlyAvailable", Test: func() CheckResult { if re.Provider == nil || re.Provider.Infra == nil { @@ -95,7 +113,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "Infrastructure status must have ControlPlaneTopology=HighlyAvailable", Test: func() CheckResult { if re.Provider == nil || re.Provider.Infra == nil { @@ -107,22 +125,22 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-009", Name: "All nodes must be healthy", Test: func() CheckResult { if re.Provider == nil || re.Provider.ClusterHealth == nil { - // return CheckRespCustomFail("unable to check Provider's ClusterHealth") return CheckResultFail } if 
re.Provider.ClusterHealth.NodeHealthPerc != 100 { - // return CheckRespCustomFail(fmt.Sprintf("NodeHealthTotal==%d", re.Provider.ClusterHealth.NodeHealthTotal)) return CheckResultFail } return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ - Name: "Pods Healthy must report be higher than 98%", + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-009", + Name: "Pods Healthy must report higher than 98%", Test: func() CheckResult { if re.Provider == nil || re.Provider.ClusterHealth == nil { return CheckResultFail @@ -134,20 +152,21 @@ func NewCheckSummary(re *Report) *CheckSummary { }, }) // Plugins Checks - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ ID: "OPCT-001", Name: "Plugin Conformance Kubernetes [10-openshift-kube-conformance] must pass (after filters)", Test: func() CheckResult { if _, ok := re.Provider.Plugins[PluginNameKubernetesConformance]; !ok { return CheckResultFail } - if re.Provider.Plugins[PluginNameKubernetesConformance].CountFilterSuite > 0 { + fmt.Println(len(re.Provider.Plugins[PluginNameKubernetesConformance].TestsFailedPrio)) + if len(re.Provider.Plugins[PluginNameKubernetesConformance].TestsFailedPrio) > 0 { return CheckResultFail } return CheckResultPass }, }) - // checks.Checks = append(checks.Checks, &Check{ + // checkSum.Checks = append(checkSum.Checks, &Check{ // Name: "OpenShift Conformance plugin 20-openshift-conformance-validated", // Test: func() CheckResult { // if _, ok := re.Provider.Plugins[PluginNameOpenShiftConformance]; !ok { @@ -162,7 +181,8 @@ func NewCheckSummary(re *Report) *CheckSummary { // return CheckResultPass // }, // }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-004", Name: "OpenShift Conformance [20-openshift-conformance-validated]: Failed tests must report less than 1.5%", Test: func() CheckResult { if _, ok := 
re.Provider.Plugins[PluginNameOpenShiftConformance]; !ok { @@ -178,7 +198,8 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-005", Name: "OpenShift Conformance [20-openshift-conformance-validated]: Priority must report less than 0.5%", Test: func() CheckResult { if _, ok := re.Provider.Plugins[PluginNameOpenShiftConformance]; !ok { @@ -194,7 +215,8 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-006", Name: "Suite Errors must report a lower number of log errors", Test: func() CheckResult { if re.Provider.ErrorCounters == nil { @@ -217,7 +239,8 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ + ID: "OPCT-007", Name: "Workloads must report a lower number of errors in the logs", Test: func() CheckResult { if re.Provider.MustGatherInfo == nil { @@ -239,7 +262,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ ID: "OPCT-003", Name: "Plugin Collector [99-openshift-artifacts-collector] must pass", Test: func() CheckResult { @@ -252,7 +275,7 @@ func NewCheckSummary(re *Report) *CheckSummary { return CheckResultPass }, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ ID: "OPCT-002", Name: "Plugin Conformance Upgrade [05-openshift-cluster-upgrade] must pass", Test: func() CheckResult { @@ -267,24 +290,31 @@ func NewCheckSummary(re *Report) *CheckSummary { }) // TODO(etcd) /* - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: 
"[TODO] etcd fio must accept the tests (TODO)", Test: AcceptanceCheckFail, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "[TODO] etcd slow requests: average must be under 500ms", Test: ExampleAcceptanceCheckPass, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "[TODO] etcd slow requests: p99 must be lower than 900ms", Test: AcceptanceCheckFail, }) - checks.Checks = append(checks.Checks, &Check{ + checkSum.Checks = append(checkSum.Checks, &Check{ Name: "[TODO] etcd slow requests: must nto have requests taking more than 1s", Test: ExampleAcceptanceCheckPass, }) */ - return checks + + // Create docs reference when ID is set + for c := range checkSum.Checks { + if checkSum.Checks[c].ID != "" { + checkSum.Checks[c].Reference = fmt.Sprintf("%s/#%s", checkSum.baseURL, checkSum.Checks[c].ID) + } + } + return checkSum } func (csum *CheckSummary) GetChecksFailed() []*Check { diff --git a/internal/pkg/summary/checks_test.go b/internal/pkg/summary/checks_test.go index 80152daa..b2111f5e 100644 --- a/internal/pkg/summary/checks_test.go +++ b/internal/pkg/summary/checks_test.go @@ -1,5 +1,7 @@ package summary // TODO: create validation for -// - name should not be more chars than X +// - name should not have more than X size +// - ID must be in the format OPCT-NNN +// - DOC reference must exists in docs/review/rules.md // - returns should be pass or fail diff --git a/internal/pkg/summary/report.go b/internal/pkg/summary/report.go index 314347e8..dd5ff52f 100644 --- a/internal/pkg/summary/report.go +++ b/internal/pkg/summary/report.go @@ -113,21 +113,21 @@ type ReportClusterHealth struct { } type ReportPlugin struct { - ID string `json:"id"` - Title string `json:"title"` - Name string `json:"name"` - Definition *PluginDefinition `json:"definition,omitempty"` - Stat *ReportPluginStat `json:"stat"` - ErrorCounters *ErrorCounter `json:"errorCounters,omitempty"` 
- CountFilterSuite uint64 `json:"countFilterSuite"` - CountFilterBase uint64 `json:"countFilterBase"` - CountFilterPrio uint64 `json:"countFilterFilterPrio"` - Suite *OpenshiftTestsSuite `json:"suite"` - TagsFailedPrio string `json:"tagsFailuresPriority"` - TestsFailedPrio []*ReportTestFailure `json:"testsFailuresPriority"` - TagsFlakeCI string `json:"tagsFlakeCI"` - TestsFlakeCI []*ReportTestFailure `json:"testsFlakeCI"` - Tests map[string]*TestItem `json:"tests,omitempty"` + ID string `json:"id"` + Title string `json:"title"` + Name string `json:"name"` + Definition *PluginDefinition `json:"definition,omitempty"` + Stat *ReportPluginStat `json:"stat"` + ErrorCounters *ErrorCounter `json:"errorCounters,omitempty"` + // CountFilterSuite uint64 `json:"countFilterSuite"` + // CountFilterBase uint64 `json:"countFilterBase"` + // CountFilterPrio uint64 `json:"countFilterFilterPrio"` + Suite *OpenshiftTestsSuite `json:"suite"` + TagsFailedPrio string `json:"tagsFailuresPriority"` + TestsFailedPrio []*ReportTestFailure `json:"testsFailuresPriority"` + TagsFlakeCI string `json:"tagsFlakeCI"` + TestsFlakeCI []*ReportTestFailure `json:"testsFlakeCI"` + Tests map[string]*TestItem `json:"tests,omitempty"` } type ReportPluginStat struct {