From 3282c039b517da6e0bab816c93a6135209c574f8 Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Thu, 3 Jun 2021 20:00:56 +0530 Subject: [PATCH] Add abstractions for validator Signed-off-by: Saswata Mukherjee --- main.go | 2 +- pkg/extflag/pathorcontent.go | 67 ++++--- pkg/mdformatter/linktransformer/config.go | 134 ++++++++++++-- pkg/mdformatter/linktransformer/link.go | 18 +- pkg/mdformatter/linktransformer/link_test.go | 14 +- pkg/mdformatter/linktransformer/validator.go | 173 ++++++------------- 6 files changed, 241 insertions(+), 167 deletions(-) diff --git a/main.go b/main.go index 39e185f..606de27 100644 --- a/main.go +++ b/main.go @@ -127,7 +127,7 @@ This directive runs executable with arguments and put its stderr and stdout outp "Absolute path links will be converted to relative links to anchor dir as well.").Regexp() // TODO(bwplotka): Add cache in file? linksValidateEnabled := cmd.Flag("links.validate", "If true, all links will be validated").Short('l').Bool() - linksValidateConfig := extflag.RegisterPathOrContent(cmd, "links.validate.config", "YAML file for skipping link check, with spec defined in github.com/bwplotka/mdox/pkg/linktransformer.Config", false, true) + linksValidateConfig := extflag.RegisterPathOrContent(cmd, "links.validate.config", "YAML file for skipping link check, with spec defined in github.com/bwplotka/mdox/pkg/linktransformer.Config", extflag.WithEnvSubstitution(true)) cmd.Run(func(ctx context.Context, logger log.Logger) (err error) { var opts []mdformatter.Option diff --git a/pkg/extflag/pathorcontent.go b/pkg/extflag/pathorcontent.go index db73b10..79fff3f 100644 --- a/pkg/extflag/pathorcontent.go +++ b/pkg/extflag/pathorcontent.go @@ -12,7 +12,6 @@ import ( "io/ioutil" "os" "regexp" - "strings" "github.com/pkg/errors" "gopkg.in/alecthomas/kingpin.v2" @@ -29,12 +28,15 @@ type PathOrContent struct { content *string } +// Option is a functional option type for PathOrContent objects. +type Option func(*PathOrContent) + type FlagClause interface { Flag(name, help string) *kingpin.FlagClause } // RegisterPathOrContent registers PathOrContent flag in kingpinCmdClause. -func RegisterPathOrContent(cmd FlagClause, flagName string, help string, required bool, envSubstitution bool) *PathOrContent { +func RegisterPathOrContent(cmd FlagClause, flagName string, help string, opts ...Option) *PathOrContent { fileFlagName := fmt.Sprintf("%s-file", flagName) contentFlagName := flagName @@ -44,13 +46,17 @@ func RegisterPathOrContent(cmd FlagClause, flagName string, help string, require contentHelp := fmt.Sprintf("Alternative to '%s' flag (mutually exclusive). Content of %s", fileFlagName, help) contentFlag := cmd.Flag(contentFlagName, contentHelp).PlaceHolder("").String() - return &PathOrContent{ + p := &PathOrContent{ flagName: flagName, - required: required, path: fileFlag, content: contentFlag, - envSubstitution: envSubstitution, + required: false, + envSubstitution: false, + } + for _, opt := range opts { + opt(p) } + return p } // Content returns the content of the file when given or directly the content that has been passed to the flag. @@ -80,24 +86,45 @@ func (p *PathOrContent) Content() ([]byte, error) { return nil, errors.Errorf("flag %s or %s is required for running this command and content cannot be empty.", fileFlagName, p.flagName) } if p.envSubstitution { - content = substituteEnvVars(string(content)) + replace, err := expandEnv(content) + if err != nil { + return nil, err + } + content = replace } return content, nil } -// substituteEnvVars returns content of YAML file with substituted environment variables. -// Will be substituted with empty string if env var isn't set. -// Follows K8s convention, i.e $(...), as mentioned here https://kubernetes.io/docs/tasks/inject-data-application/define-interdependent-environment-variables/. -func substituteEnvVars(content string) []byte { - var replaceWithEnv []string - // Match env variable syntax. - envVarName := regexp.MustCompile(`\$\((?P[a-zA-Z_]+[a-zA-Z0-9_]*)\)`) - loc := envVarName.FindAllStringSubmatchIndex(content, -1) - for i := range loc { - // Add pair to be replaced. - replaceWithEnv = append(replaceWithEnv, content[loc[i][0]:loc[i][1]], os.Getenv(content[loc[i][2]:loc[i][3]])) +// WithRequired allows you to override default required option. +func WithRequired(r bool) Option { + return func(p *PathOrContent) { + p.required = r + } +} + +// WithRequired allows you to override default envSubstitution option. +func WithEnvSubstitution(e bool) Option { + return func(p *PathOrContent) { + p.envSubstitution = e } - replacer := strings.NewReplacer(replaceWithEnv...) - contentWithEnv := replacer.Replace(content) - return []byte(contentWithEnv) +} + +// expandEnv returns content of YAML file with substituted environment variables. +// Follows K8s convention, i.e $(...), as mentioned here https://kubernetes.io/docs/tasks/inject-data-application/define-interdependent-environment-variables/. +func expandEnv(b []byte) (r []byte, err error) { + var envRe = regexp.MustCompile(`\$\(([a-zA-Z_0-9]+)\)`) + r = envRe.ReplaceAllFunc(b, func(n []byte) []byte { + if err != nil { + return nil + } + n = n[2 : len(n)-1] + + v, ok := os.LookupEnv(string(n)) + if !ok { + err = errors.Errorf("found reference to unset environment variable %q", n) + return nil + } + return []byte(v) + }) + return r, err } diff --git a/pkg/mdformatter/linktransformer/config.go b/pkg/mdformatter/linktransformer/config.go index debacfd..78991a5 100644 --- a/pkg/mdformatter/linktransformer/config.go +++ b/pkg/mdformatter/linktransformer/config.go @@ -5,35 +5,65 @@ package linktransformer import ( "bytes" + "encoding/json" + "fmt" + "math" + "net/http" "regexp" + "strconv" "github.com/pkg/errors" "gopkg.in/yaml.v3" ) -const roundtrip ValidatorType = "roundtrip" - -// const github ValidatorType = "github" - type Config struct { Version int - Validators []Validator `yaml:"validators"` + Validators []ValidatorConfig `yaml:"validators"` } -type Validator struct { - _regex *regexp.Regexp - _maxNum int +type ValidatorConfig struct { // Regex for type github is reponame matcher, like `bwplotka\/mdox`. Regex string `yaml:"regex"` - // By default type is `roundtrip`. Could be `github`. + // By default type is `ignore`. Could be `github` or `roundtrip`. Type ValidatorType `yaml:"type"` // GitHub repo token to avoid getting rate limited. Token string `yaml:"token"` + + ghValidator GitHubValidator + rtValidator RoundTripValidator + igValidator IgnoreValidator +} + +type RoundTripValidator struct { + _regex *regexp.Regexp +} + +type GitHubValidator struct { + _regex *regexp.Regexp + _maxNum int +} + +type IgnoreValidator struct { + _regex *regexp.Regexp } type ValidatorType string +const ( + roundtripValidator ValidatorType = "roundtrip" + githubValidator ValidatorType = "github" + ignoreValidator ValidatorType = "ignore" +) + +const ( + gitHubAPIURL = "https://api.github.com/repos/%v/%v?sort=created&direction=desc&per_page=1" +) + +type GitHubResponse struct { + Number int `json:"number"` +} + func ParseConfig(c []byte) (Config, error) { cfg := Config{} dec := yaml.NewDecoder(bytes.NewReader(c)) @@ -46,8 +76,92 @@ func ParseConfig(c []byte) (Config, error) { return Config{}, errors.New("No validator provided") } + // Evaluate regex for given validators. for i := range cfg.Validators { - cfg.Validators[i]._regex = regexp.MustCompile(cfg.Validators[i].Regex) + switch cfg.Validators[i].Type { + case roundtripValidator: + cfg.Validators[i].rtValidator._regex = regexp.MustCompile(cfg.Validators[i].Regex) + case githubValidator: + regex, maxNum, err := getGitHubRegex(cfg.Validators[i].Regex, cfg.Validators[i].Token) + if err != nil { + return Config{}, errors.Wrapf(err, "parsing GitHub Regex %v", err) + } + cfg.Validators[i].ghValidator._regex = regex + cfg.Validators[i].ghValidator._maxNum = maxNum + case ignoreValidator: + cfg.Validators[i].igValidator._regex = regexp.MustCompile(cfg.Validators[i].Regex) + default: + return Config{}, errors.New("Validator type not supported") + } } return cfg, nil } + +// getGitHubRegex returns GitHub pulls/issues regex from repo name. +func getGitHubRegex(repoRe string, repoToken string) (*regexp.Regexp, int, error) { + // Get reponame from regex. + getRepo := regexp.MustCompile(`(?P[A-Za-z0-9_.-]+)\\\/(?P[A-Za-z0-9_.-]+)`) + match := getRepo.FindStringSubmatch(repoRe) + if len(match) != 3 { + return nil, math.MaxInt64, errors.New("repo name regex not valid") + } + reponame := match[1] + "/" + match[2] + + var pullNum []GitHubResponse + var issueNum []GitHubResponse + max := 0 + // All GitHub API reqs need to have User-Agent: https://docs.github.com/en/rest/overview/resources-in-the-rest-api#user-agent-required. + client := &http.Client{} + + // Check latest pull request number. + reqPull, err := http.NewRequest("GET", fmt.Sprintf(gitHubAPIURL, reponame, "pulls"), nil) + if err != nil { + return nil, math.MaxInt64, err + } + reqPull.Header.Set("User-Agent", "mdox") + + // Check latest issue number and return whichever is greater. + reqIssue, err := http.NewRequest("GET", fmt.Sprintf(gitHubAPIURL, reponame, "issues"), nil) + if err != nil { + return nil, math.MaxInt64, err + } + reqIssue.Header.Set("User-Agent", "mdox") + + if repoToken != "" { + reqPull.Header.Set("Authorization", "Bearer "+repoToken) + reqIssue.Header.Set("Authorization", "Bearer "+repoToken) + } + + respPull, err := client.Do(reqPull) + if err != nil { + return nil, math.MaxInt64, err + } + if respPull.StatusCode != 200 { + return nil, math.MaxInt64, errors.New("pulls API request failed. status code: " + strconv.Itoa(respPull.StatusCode)) + } + defer respPull.Body.Close() + if err := json.NewDecoder(respPull.Body).Decode(&pullNum); err != nil { + return nil, math.MaxInt64, err + } + + respIssue, err := client.Do(reqIssue) + if err != nil { + return nil, math.MaxInt64, err + } + if respIssue.StatusCode != 200 { + return nil, math.MaxInt64, errors.New("issues API request failed. status code: " + strconv.Itoa(respIssue.StatusCode)) + } + defer respIssue.Body.Close() + if err := json.NewDecoder(respIssue.Body).Decode(&issueNum); err != nil { + return nil, math.MaxInt64, err + } + + if len(pullNum) > 0 { + max = pullNum[0].Number + } + if len(issueNum) > 0 && issueNum[0].Number > max { + max = issueNum[0].Number + } + + return regexp.MustCompile(`(^http[s]?:\/\/)(www\.)?(github\.com\/)(` + repoRe + `)(\/pull\/|\/issues\/)`), max, nil +} diff --git a/pkg/mdformatter/linktransformer/link.go b/pkg/mdformatter/linktransformer/link.go index 92987ae..2c1475d 100644 --- a/pkg/mdformatter/linktransformer/link.go +++ b/pkg/mdformatter/linktransformer/link.go @@ -136,8 +136,8 @@ type futureResult struct { // NewValidator returns mdformatter.LinkTransformer that crawls all links. // TODO(bwplotka): Add optimization and debug modes - this is the main source of latency and pain. func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir string) (mdformatter.LinkTransformer, error) { - var config Config var err error + config := Config{} if string(linksValidateConfig) != "" { config, err = ParseConfig(linksValidateConfig) if err != nil { @@ -177,10 +177,6 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin defer v.rMu.Unlock() v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) }) - err = v.validateConfig.validateGH() - if err != nil { - return nil, err - } return v, nil } @@ -244,11 +240,6 @@ func (v *validator) visit(filepath string, dest string) { return } v.destFutures[k] = &futureResult{cases: 1, resultFn: func() error { return nil }} - validator := v.validateConfig.GetValidatorForURL(dest) - if validator.Matched { - return - } - matches := remoteLinkPrefixRe.FindAllStringIndex(dest, 1) if matches == nil { // Relative or absolute path. Check if exists. @@ -260,6 +251,13 @@ func (v *validator) visit(filepath string, dest string) { } return } + validator := v.validateConfig.GetValidatorForURL(dest) + if validator != nil { + matched, err := validator.IsValid(dest) + if matched && err == nil { + return + } + } // Result will be in future. v.destFutures[k].resultFn = func() error { return v.remoteLinks[dest] } diff --git a/pkg/mdformatter/linktransformer/link_test.go b/pkg/mdformatter/linktransformer/link_test.go index 82df411..d15b970 100644 --- a/pkg/mdformatter/linktransformer/link_test.go +++ b/pkg/mdformatter/linktransformer/link_test.go @@ -230,21 +230,21 @@ func TestValidator_TransformDestination(t *testing.T) { testutil.Equals(t, fmt.Sprintf("%v%v: %v%v: \"https://bwplotka.dev/does-not-exists\" not accessible; status code 404: Not Found", tmpDir, filePath, relDirPath, filePath), err.Error()) }) - t.Run("check 404 link with validate config", func(t *testing.T) { + t.Run("check valid & 404 link with validate config", func(t *testing.T) { testFile := filepath.Join(tmpDir, "repo", "docs", "test", "invalid-link2.md") - testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://bwplotka.dev/does-not-exists\n"), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://www.github.com/ https://bwplotka.dev/does-not-exits\n"), os.ModePerm)) diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}) testutil.Ok(t, err) testutil.Equals(t, 0, len(diff), diff.String()) _, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer( - MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: '://bwplotka.dev'\n type: 'roundtrip'\n"), anchorDir), + MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: 'github'\n type: 'roundtrip'\n"), anchorDir), )) testutil.Ok(t, err) }) - t.Run("check links with validate config", func(t *testing.T) { + t.Run("check 404 links with ignore validate config", func(t *testing.T) { testFile := filepath.Join(tmpDir, "repo", "docs", "test", "links.md") testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://fakelink1.com/ http://fakelink2.com/ https://www.fakelink3.com/\n"), os.ModePerm)) @@ -253,7 +253,7 @@ func TestValidator_TransformDestination(t *testing.T) { testutil.Equals(t, 0, len(diff), diff.String()) _, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer( - MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: '(^http[s]?:\\/\\/)(www\\.)?(fakelink[0-9]\\.com\\/)'\n type: 'roundtrip'\n"), anchorDir), + MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: '(^http[s]?:\\/\\/)(www\\.)?(fakelink[0-9]\\.com\\/)'\n type: 'ignore'\n"), anchorDir), )) testutil.Ok(t, err) }) @@ -261,13 +261,15 @@ func TestValidator_TransformDestination(t *testing.T) { t.Run("check github links with validate config", func(t *testing.T) { testFile := filepath.Join(tmpDir, "repo", "docs", "test", "github-link.md") testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://github.com/bwplotka/mdox/issues/23 https://github.com/bwplotka/mdox/pull/32 https://github.com/bwplotka/mdox/pull/27#pullrequestreview-659598194\n"), os.ModePerm)) + // This is substituted in config using PathorContent flag. But need to pass it directly here. + repoToken := os.Getenv("GITHUB_TOKEN") diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}) testutil.Ok(t, err) testutil.Equals(t, 0, len(diff), diff.String()) _, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer( - MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: 'bwplotka\\/mdox'\n type: 'github'\n"), anchorDir), + MustNewValidator(logger, []byte("version: 1\n\nvalidators:\n - regex: 'bwplotka\\/mdox'\n token: '"+repoToken+"'\n type: 'github'\n"), anchorDir), )) testutil.Ok(t, err) }) diff --git a/pkg/mdformatter/linktransformer/validator.go b/pkg/mdformatter/linktransformer/validator.go index afab67d..a7cad7c 100644 --- a/pkg/mdformatter/linktransformer/validator.go +++ b/pkg/mdformatter/linktransformer/validator.go @@ -4,140 +4,73 @@ package linktransformer import ( - "encoding/json" - "fmt" - "math" - "net/http" - "regexp" "strconv" "strings" - - "github.com/pkg/errors" ) -const ( - gitHubAPIURL = "https://api.github.com/repos/%v/%v?sort=created&direction=desc&per_page=1" -) +type Validator interface { + IsValid(URL string) (bool, error) +} + +// GitHubValidator.IsValid skips visiting all github issue/PR links. +func (v GitHubValidator) IsValid(URL string) (bool, error) { + // Find rightmost index of match i.e, where regex match ends. + // This will be where issue/PR number starts. Split incase of section link and convert to int. + rightmostIndex := v._regex.FindStringIndex(URL) + stringNum := strings.Split(URL[rightmostIndex[1]:], "#") + num, err := strconv.Atoi(stringNum[0]) + if err != nil { + return false, err + } + // If number in link does not exceed then link is valid. + if v._maxNum >= num { + return true, nil + } + return false, nil +} -type GitHubResponse struct { - Number int `json:"number"` +// RoundTripValidator.IsValid returns false if url matches, to ensure it is visited by colly. +func (v RoundTripValidator) IsValid(URL string) (bool, error) { + return false, nil } -type URLValidator struct { - Matched bool +// IgnoreValidator.IsValid returns true if matched so that link in not checked. +func (v IgnoreValidator) IsValid(URL string) (bool, error) { + return true, nil } -// GetValidatorForURL matches link with any one of provided validators. -func (v Config) GetValidatorForURL(url string) URLValidator { - u := URLValidator{Matched: false} +// GetValidatorForURL returns correct Validator by matching URL. +func (v Config) GetValidatorForURL(URL string) Validator { + var u Validator for _, val := range v.Validators { - if !val._regex.MatchString(url) { - continue - } - if val.Type == roundtrip { - u.Matched = true + switch val.Type { + case roundtripValidator: + if !val.rtValidator._regex.MatchString(URL) { + continue + } + u = val.rtValidator return u - } - // Find rightmost index of match i.e, where regex match ends. - // This will be where issue/PR number starts. Split incase of section link and convert to int. - rightmostIndex := val._regex.FindStringIndex(url) - stringNum := strings.Split(url[rightmostIndex[1]:], "#") - num, err := strconv.Atoi(stringNum[0]) - // If number in link does not exceed then link is valid. Otherwise will be checked by v.c.Visit. - if val._maxNum >= num && err == nil { - u.Matched = true + case githubValidator: + if !val.ghValidator._regex.MatchString(URL) { + continue + } + u = val.ghValidator return u - } - } - return u -} - -// validateGH changes regex and adds maxNum, if type is "github". -func (v Config) validateGH() error { - for i := range v.Validators { - if v.Validators[i].Type == roundtrip { - continue - } - if v.Validators[i].Regex == "" { - v.Validators[i]._regex = regexp.MustCompile(`^$`) - v.Validators[i]._maxNum = math.MaxInt64 + case ignoreValidator: + if !val.igValidator._regex.MatchString(URL) { + continue + } + u = val.igValidator + return u + default: continue } - regex, maxNum, err := getGitHubRegex(v.Validators[i].Regex, v.Validators[i].Token) - if err != nil { - return err - } - v.Validators[i]._regex = regex - v.Validators[i]._maxNum = maxNum - } - return nil -} - -// getGitHubRegex returns GitHub pulls/issues regex from repo name. -func getGitHubRegex(repoRe string, repoToken string) (*regexp.Regexp, int, error) { - // Get reponame from regex. - getRepo := regexp.MustCompile(`(?P[A-Za-z0-9_.-]+)\\\/(?P[A-Za-z0-9_.-]+)`) - match := getRepo.FindStringSubmatch(repoRe) - if len(match) != 3 { - return nil, math.MaxInt64, errors.New("repo name regex not valid") } - reponame := match[1] + "/" + match[2] - - var pullNum []GitHubResponse - var issueNum []GitHubResponse - max := 0 - // All GitHub API reqs need to have User-Agent: https://docs.github.com/en/rest/overview/resources-in-the-rest-api#user-agent-required. - client := &http.Client{} - - // Check latest pull request number. - reqPull, err := http.NewRequest("GET", fmt.Sprintf(gitHubAPIURL, reponame, "pulls"), nil) - if err != nil { - return nil, math.MaxInt64, err - } - reqPull.Header.Set("User-Agent", "mdox") - - // Check latest issue number and return whichever is greater. - reqIssue, err := http.NewRequest("GET", fmt.Sprintf(gitHubAPIURL, reponame, "issues"), nil) - if err != nil { - return nil, math.MaxInt64, err - } - reqIssue.Header.Set("User-Agent", "mdox") - - if repoToken != "" { - reqPull.Header.Set("Authorization", "Bearer "+repoToken) - reqIssue.Header.Set("Authorization", "Bearer "+repoToken) - } - - respPull, err := client.Do(reqPull) - if err != nil { - return nil, math.MaxInt64, err - } - if respPull.StatusCode != 200 { - return nil, math.MaxInt64, errors.New("pulls API request failed. status code: " + strconv.Itoa(respPull.StatusCode)) + // By default all links are ignored. + u = IgnoreValidator{} + // No config file passed, so all links must be checked. + if len(v.Validators) == 0 { + u = nil } - defer respPull.Body.Close() - if err := json.NewDecoder(respPull.Body).Decode(&pullNum); err != nil { - return nil, math.MaxInt64, err - } - - respIssue, err := client.Do(reqIssue) - if err != nil { - return nil, math.MaxInt64, err - } - if respIssue.StatusCode != 200 { - return nil, math.MaxInt64, errors.New("issues API request failed. status code: " + strconv.Itoa(respIssue.StatusCode)) - } - defer respIssue.Body.Close() - if err := json.NewDecoder(respIssue.Body).Decode(&issueNum); err != nil { - return nil, math.MaxInt64, err - } - - if len(pullNum) > 0 { - max = pullNum[0].Number - } - if len(issueNum) > 0 && issueNum[0].Number > max { - max = issueNum[0].Number - } - - return regexp.MustCompile(`(^http[s]?:\/\/)(www\.)?(github\.com\/)(` + repoRe + `)(\/pull\/|\/issues\/)`), max, nil + return u }