From 88790de6463bd93461cb7c8f6a83529004722249 Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Thu, 27 May 2021 11:36:57 +0530 Subject: [PATCH] Add links.validate.config flag Signed-off-by: Saswata Mukherjee --- README.md | 8 +- main.go | 4 +- pkg/mdformatter/linktransformer/config.go | 64 +++++++++++ pkg/mdformatter/linktransformer/link.go | 106 ++++-------------- pkg/mdformatter/linktransformer/link_test.go | 24 +++- pkg/mdformatter/linktransformer/validator.go | 112 +++++++++++++++++++ 6 files changed, 226 insertions(+), 92 deletions(-) create mode 100644 pkg/mdformatter/linktransformer/config.go create mode 100644 pkg/mdformatter/linktransformer/validator.go diff --git a/README.md b/README.md index 2e9acd2..36ad85c 100644 --- a/README.md +++ b/README.md @@ -63,10 +63,10 @@ Flags: --links.validate.without-address-regex=^$ If specified, all links will be validated, except those matching the given target address. - --links.validate.without-github-links="" - If specified, all links will be validated, - except the GitHub links for PRs and issues of - the given repo. + --links.validate.config="" + Path to YAML file for skipping link check, with + spec defined in + github.com/bwplotka/mdox/pkg/linktranformer.Config Args: Markdown file(s) to process. diff --git a/main.go b/main.go index 49f8fb3..fe06e88 100644 --- a/main.go +++ b/main.go @@ -127,7 +127,7 @@ This directive runs executable with arguments and put its stderr and stdout outp // TODO(bwplotka): Add cache in file? 
linksValidateEnabled := cmd.Flag("links.validate", "If true, all links will be validated").Short('l').Bool() linksValidateExceptDomains := cmd.Flag("links.validate.without-address-regex", "If specified, all links will be validated, except those matching the given target address.").Default(`^$`).Regexp() - linksSkipGitHub := cmd.Flag("links.validate.without-github-links", "If specified, all links will be validated, except the GitHub links for PRs and issues of the given repo.").Default("").String() + linksValidateConfig := cmd.Flag("links.validate.config", "Path to YAML file for skipping link check, with spec defined in github.com/bwplotka/mdox/pkg/linktranformer.Config").Default("").String() cmd.Run(func(ctx context.Context, logger log.Logger) (err error) { var opts []mdformatter.Option @@ -152,7 +152,7 @@ This directive runs executable with arguments and put its stderr and stdout outp var linkTr []mdformatter.LinkTransformer if *linksValidateEnabled { - v, err := linktransformer.NewValidator(logger, *linksValidateExceptDomains, *linksSkipGitHub, anchorDir) + v, err := linktransformer.NewValidator(logger, *linksValidateExceptDomains, *linksValidateConfig, anchorDir) if err != nil { return err } diff --git a/pkg/mdformatter/linktransformer/config.go b/pkg/mdformatter/linktransformer/config.go new file mode 100644 index 0000000..6c444ac --- /dev/null +++ b/pkg/mdformatter/linktransformer/config.go @@ -0,0 +1,64 @@ +// Copyright (c) Bartłomiej Płotka @bwplotka +// Licensed under the Apache License 2.0. + +package linktransformer + +import ( + "bytes" + "io/ioutil" + "path/filepath" + "regexp" + + "github.com/pkg/errors" + "gopkg.in/yaml.v3" +) + +type Config struct { + Version int + + Validate struct { + Validators []Validator `yaml:"validators"` + } +} + +type Validator struct { + _regex *regexp.Regexp + _maxnum int + // Regex for type github is reponame matcher, like `bwplotka\/mdox`. + Regex string `yaml:"regex"` + // By default type is `roundtrip`. 
Could be `github`. + Type string `yaml:"type"` +} + +func parseConfigFile(configFile string) (Config, error) { + if configFile == "" { + return Config{}, nil + } + configFile, err := filepath.Abs(configFile) + if err != nil { + return Config{}, errors.Wrap(err, "abs") + } + c, err := ioutil.ReadFile(configFile) + if err != nil { + return Config{}, errors.Wrap(err, "read config file") + } + return ParseConfig(c) +} + +func ParseConfig(c []byte) (Config, error) { + cfg := Config{} + dec := yaml.NewDecoder(bytes.NewReader(c)) + dec.KnownFields(true) + if err := dec.Decode(&cfg); err != nil { + return Config{}, errors.Wrapf(err, "parsing YAML content %q", string(c)) + } + + if len(cfg.Validate.Validators) <= 0 { + return Config{}, errors.New("No validator provided") + } + + for i := range cfg.Validate.Validators { + cfg.Validate.Validators[i]._regex = regexp.MustCompile(cfg.Validate.Validators[i].Regex) + } + return cfg, nil +} diff --git a/pkg/mdformatter/linktransformer/link.go b/pkg/mdformatter/linktransformer/link.go index 5fcd849..45c4350 100644 --- a/pkg/mdformatter/linktransformer/link.go +++ b/pkg/mdformatter/linktransformer/link.go @@ -6,16 +6,11 @@ package linktransformer import ( "bufio" "bytes" - "encoding/json" - "fmt" "io" - "math" - "net/http" "os" "path/filepath" "regexp" "sort" - "strconv" "strings" "sync" @@ -38,7 +33,6 @@ var ( const ( originalURLKey = "originalURLKey" - gitHubAPIURL = "https://api.github.com/repos/%v/%v?sort=created&direction=desc&per_page=1" ) type chain struct { @@ -116,11 +110,10 @@ func (l *localizer) TransformDestination(ctx mdformatter.SourceContext, destinat func (l *localizer) Close(mdformatter.SourceContext) error { return nil } type validator struct { - logger log.Logger - anchorDir string - except *regexp.Regexp - skipGitHub *regexp.Regexp - gitHubNum int + logger log.Logger + anchorDir string + except *regexp.Regexp + validateConfig Config localLinks localLinksCache rMu sync.RWMutex @@ -143,21 +136,20 @@ type 
futureResult struct { // NewValidator returns mdformatter.LinkTransformer that crawls all links. // TODO(bwplotka): Add optimization and debug modes - this is the main source of latency and pain. -func NewValidator(logger log.Logger, except *regexp.Regexp, repo string, anchorDir string) (mdformatter.LinkTransformer, error) { - skipGitHub, gitHubNum, err := getGitHubRegex(repo) +func NewValidator(logger log.Logger, except *regexp.Regexp, linksValidateConfig string, anchorDir string) (mdformatter.LinkTransformer, error) { + config, err := parseConfigFile(linksValidateConfig) if err != nil { return nil, err } v := &validator{ - logger: logger, - anchorDir: anchorDir, - except: except, - skipGitHub: skipGitHub, - gitHubNum: gitHubNum, - localLinks: map[string]*[]string{}, - remoteLinks: map[string]error{}, - c: colly.NewCollector(colly.Async()), - destFutures: map[futureKey]*futureResult{}, + logger: logger, + anchorDir: anchorDir, + except: except, + validateConfig: config, + localLinks: map[string]*[]string{}, + remoteLinks: map[string]error{}, + c: colly.NewCollector(colly.Async()), + destFutures: map[futureKey]*futureResult{}, } // Set very soft limits. // E.g github has 50-5000 https://docs.github.com/en/free-pro-team@latest/rest/reference/rate-limit limit depending @@ -183,62 +175,16 @@ func NewValidator(logger log.Logger, except *regexp.Regexp, repo string, anchorD defer v.rMu.Unlock() v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) }) - return v, nil -} - -type GitHubResponse struct { - Number int `json:"number"` -} - -func getGitHubRegex(reponame string) (*regexp.Regexp, int, error) { - if reponame != "" { - var pullNum []GitHubResponse - var issueNum []GitHubResponse - max := 0 - // Check latest pull request number. 
- respPull, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "pulls")) - if err != nil { - return nil, math.MaxInt64, err - } - if respPull.StatusCode != 200 { - return nil, math.MaxInt64, errors.New("pulls API request failed. status code: " + strconv.Itoa(respPull.StatusCode)) - } - defer respPull.Body.Close() - if err := json.NewDecoder(respPull.Body).Decode(&pullNum); err != nil { - return nil, math.MaxInt64, err - } - if len(pullNum) > 0 { - max = pullNum[0].Number - } - - // Check latest issue number and return whichever is greater. - respIssue, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "issues")) - if err != nil { - return nil, math.MaxInt64, err - } - if respIssue.StatusCode != 200 { - return nil, math.MaxInt64, errors.New("issues API request failed. status code: " + strconv.Itoa(respIssue.StatusCode)) - } - defer respIssue.Body.Close() - if err := json.NewDecoder(respIssue.Body).Decode(&issueNum); err != nil { - return nil, math.MaxInt64, err - } - if len(issueNum) > 0 && issueNum[0].Number > max { - max = issueNum[0].Number - } - - // Place forward slash between org and repo to escape slash character. - idx := strings.Index(reponame, "/") - reponame = reponame[:idx] + `\` + reponame[idx:] - return regexp.MustCompile(`(^http[s]?:\/\/)(www\.)?(github\.com\/)(` + reponame + `)(\/pull\/|\/issues\/)`), max, nil + err = CheckGitHub(v.validateConfig) + if err != nil { + return nil, err } - - return regexp.MustCompile(`^$`), math.MaxInt64, nil + return v, nil } // MustNewValidator returns mdformatter.LinkTransformer that crawls all links. 
-func MustNewValidator(logger log.Logger, except *regexp.Regexp, reponame string, anchorDir string) mdformatter.LinkTransformer { - v, err := NewValidator(logger, except, reponame, anchorDir) +func MustNewValidator(logger log.Logger, except *regexp.Regexp, linksValidateConfig string, anchorDir string) mdformatter.LinkTransformer { + v, err := NewValidator(logger, except, linksValidateConfig, anchorDir) if err != nil { panic(err) } @@ -299,16 +245,8 @@ func (v *validator) visit(filepath string, dest string) { if v.except.MatchString(dest) { return } - if v.skipGitHub.MatchString(dest) { - // Find rightmost index of match i.e, where regex match ends. - // This will be where issue/PR number starts. Split incase of section link and convert to int. - idx := v.skipGitHub.FindStringIndex(dest) - stringNum := strings.Split(dest[idx[1]:], "#") - num, err := strconv.Atoi(stringNum[0]) - // If number in link does not exceed then link is valid. Otherwise will be checked by v.c.Visit. - if v.gitHubNum >= num && err == nil { - return - } + if CheckValidators(dest, v.validateConfig) { + return } matches := remoteLinkPrefixRe.FindAllStringIndex(dest, 1) diff --git a/pkg/mdformatter/linktransformer/link_test.go b/pkg/mdformatter/linktransformer/link_test.go index 28d51b7..8c20d4a 100644 --- a/pkg/mdformatter/linktransformer/link_test.go +++ b/pkg/mdformatter/linktransformer/link_test.go @@ -244,16 +244,36 @@ func TestValidator_TransformDestination(t *testing.T) { testutil.Ok(t, err) }) - t.Run("check github links, skipped", func(t *testing.T) { + t.Run("check links with validate config", func(t *testing.T) { + testFile := filepath.Join(tmpDir, "repo", "docs", "test", "links.md") + mdoxFile := filepath.Join(tmpDir, "mdox.yaml") + + testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://fakelink1.com/ http://fakelink2.com/ https://www.fakelink3.com/\n"), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(mdoxFile, []byte("version: 1\n\nvalidate:\n validators:\n - regex: 
'(^http[s]?:\\/\\/)(www\\.)?(fakelink[0-9]\\.com\\/)'\n type: 'roundtrip'\n"), os.ModePerm)) + + diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}) + testutil.Ok(t, err) + testutil.Equals(t, 0, len(diff), diff.String()) + + _, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer( + MustNewValidator(logger, regexp.MustCompile(`^$`), mdoxFile, anchorDir), + )) + testutil.Ok(t, err) + }) + + t.Run("check github links with validate config", func(t *testing.T) { testFile := filepath.Join(tmpDir, "repo", "docs", "test", "github-link.md") + mdoxFile := filepath.Join(tmpDir, "mdox.yaml") + testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://github.com/bwplotka/mdox/issues/23 https://github.com/bwplotka/mdox/pull/32 https://github.com/bwplotka/mdox/pull/27#pullrequestreview-659598194\n"), os.ModePerm)) + testutil.Ok(t, ioutil.WriteFile(mdoxFile, []byte("version: 1\n\nvalidate:\n validators:\n - regex: 'bwplotka\\/mdox'\n type: 'github'\n"), os.ModePerm)) diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}) testutil.Ok(t, err) testutil.Equals(t, 0, len(diff), diff.String()) _, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer( - MustNewValidator(logger, regexp.MustCompile(`^$`), "bwplotka/mdox", anchorDir), + MustNewValidator(logger, regexp.MustCompile(`^$`), mdoxFile, anchorDir), )) testutil.Ok(t, err) }) diff --git a/pkg/mdformatter/linktransformer/validator.go b/pkg/mdformatter/linktransformer/validator.go new file mode 100644 index 0000000..21a02f7 --- /dev/null +++ b/pkg/mdformatter/linktransformer/validator.go @@ -0,0 +1,112 @@ +// Copyright (c) Bartłomiej Płotka @bwplotka +// Licensed under the Apache License 2.0. 
+
+package linktransformer
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/pkg/errors"
+)
+
+const (
+	gitHubAPIURL = "https://api.github.com/repos/%v/%v?sort=created&direction=desc&per_page=1"
+)
+
+type GitHubResponse struct {
+	Number int `json:"number"`
+}
+
+// CheckValidators reports whether dest matches one of the validators in v, i.e. whether its remote check can be skipped.
+func CheckValidators(dest string, v Config) bool {
+	for _, val := range v.Validate.Validators {
+		if val._regex.MatchString(dest) {
+			if val.Type == "github" {
+				// The regex match ends right where the issue/PR number starts.
+				// Split off any "#..." section suffix before converting the number to int.
+				idx := val._regex.FindStringIndex(dest)
+				stringNum := strings.Split(dest[idx[1]:], "#")
+				num, err := strconv.Atoi(stringNum[0])
+				// Numbers up to _maxnum are accepted; anything else is reported as unmatched so the caller checks the link itself.
+				if val._maxnum >= num && err == nil {
+					return true
+				}
+				return false
+			}
+			return true
+		}
+	}
+	return false
+}
+
+// CheckGitHub swaps the regex of every "github" validator in v for the pulls/issues URL regex and records its max number.
+func CheckGitHub(v Config) error {
+	for i := range v.Validate.Validators {
+		if v.Validate.Validators[i].Type == "github" {
+			regex, maxnum, err := getGitHubRegex(v.Validate.Validators[i].Regex)
+			if err != nil {
+				return err
+			}
+			v.Validate.Validators[i]._regex = regex
+			v.Validate.Validators[i]._maxnum = maxnum
+		}
+	}
+	return nil
+}
+
+// getGitHubRegex builds the pulls/issues URL regex for the escaped repo name repoRe and fetches the latest pull/issue number.
+func getGitHubRegex(repoRe string) (*regexp.Regexp, int, error) {
+	if repoRe != "" {
+		// Drop the first backslash (the escaped slash in `org\/repo`) to get the plain repo name for the API URL.
+		idx := strings.Index(repoRe, `\`)
+		if idx == -1 {
+			return nil, math.MaxInt64, errors.New("repo name regex not valid")
+		}
+		reponame := repoRe[:idx] + repoRe[idx+1:]
+
+		var pullNum []GitHubResponse
+		var issueNum []GitHubResponse
+		max := 0
+		// Check latest pull request number.
+		respPull, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "pulls"))
+		if err != nil {
+			return nil, math.MaxInt64, err
+		}
+		defer respPull.Body.Close() // Registered before the status check so the body is closed on the non-200 return too.
+		if respPull.StatusCode != http.StatusOK {
+			return nil, math.MaxInt64, errors.New("pulls API request failed. status code: " + strconv.Itoa(respPull.StatusCode))
+		}
+		if err := json.NewDecoder(respPull.Body).Decode(&pullNum); err != nil {
+			return nil, math.MaxInt64, err
+		}
+		if len(pullNum) > 0 {
+			max = pullNum[0].Number
+		}
+
+		// Check latest issue number and return whichever is greater.
+		respIssue, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "issues"))
+		if err != nil {
+			return nil, math.MaxInt64, err
+		}
+		defer respIssue.Body.Close() // Registered before the status check so the body is closed on the non-200 return too.
+		if respIssue.StatusCode != http.StatusOK {
+			return nil, math.MaxInt64, errors.New("issues API request failed. status code: " + strconv.Itoa(respIssue.StatusCode))
+		}
+		if err := json.NewDecoder(respIssue.Body).Decode(&issueNum); err != nil {
+			return nil, math.MaxInt64, err
+		}
+		if len(issueNum) > 0 && issueNum[0].Number > max {
+			max = issueNum[0].Number
+		}
+
+		return regexp.MustCompile(`(^http[s]?:\/\/)(www\.)?(github\.com\/)(` + repoRe + `)(\/pull\/|\/issues\/)`), max, nil
+	}
+
+	return regexp.MustCompile(`^$`), math.MaxInt64, nil
+}