Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RF: linked pages #47

Merged
merged 3 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions checks/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,28 @@
)

// Checks aggregates pointers to the individual check types of this package
// (Carbon, Headers, Ip, LegacyRank, LinkedPages, Rank, SocialTags, Tls).
// NewChecks fills in every field.
type Checks struct {
Carbon *Carbon
Headers *Headers
IpAddress *Ip
LegacyRank *LegacyRank
Rank *Rank
SocialTags *SocialTags
Tls *Tls
Carbon *Carbon
Headers *Headers
IpAddress *Ip
LegacyRank *LegacyRank
LinkedPages *LinkedPages
Rank *Rank
SocialTags *SocialTags
Tls *Tls
}

// NewChecks returns a Checks in which every HTTP-based check (Carbon, Headers,
// LinkedPages, Rank, SocialTags, Tls) shares one http.Client with a 5-second
// timeout. IpAddress is built from NewNetIp() and LegacyRank from an in-memory
// legacyrank store.
func NewChecks() *Checks {
client := &http.Client{
Timeout: 5 * time.Second,
}
return &Checks{
Carbon: NewCarbon(client),
Carbon: NewCarbon(client),

Check warning on line 26 in checks/checks.go

View check run for this annotation

Codecov / codecov/patch

checks/checks.go#L26

Added line #L26 was not covered by tests
Headers: NewHeaders(client),
IpAddress: NewIp(NewNetIp()),
LegacyRank: NewLegacyRank(legacyrank.NewInMemoryStore()),
Rank: NewRank(client),
SocialTags: NewSocialTags(client),
Tls: NewTls(client),
IpAddress: NewIp(NewNetIp()),
LegacyRank: NewLegacyRank(legacyrank.NewInMemoryStore()),
LinkedPages: NewLinkedPages(client),
Rank: NewRank(client),
SocialTags: NewSocialTags(client),
Tls: NewTls(client),

Check warning on line 33 in checks/checks.go

View check run for this annotation

Codecov / codecov/patch

checks/checks.go#L28-L33

Added lines #L28 - L33 were not covered by tests
}
}
120 changes: 120 additions & 0 deletions checks/linked_pages.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package checks

import (
"context"
"fmt"
"net/http"
"net/url"
"sort"
"strings"

"golang.org/x/net/html"
)

// LinkedPagesData is the result of a linked-pages check: the URLs found on a
// page, split into those on the target's own host and those on other hosts.
// GetLinkedPages fills both slices via sortURLsByFrequency.
type LinkedPagesData struct {
// Internal lists links whose host matches the target host; serialized as "internal".
Internal []string `json:"internal"`
// External lists http/https links to other hosts; serialized as "external".
External []string `json:"external"`
}

// LinkedPages performs the linked-pages check using the HTTP client it was
// constructed with.
type LinkedPages struct {
// client is used by GetLinkedPages to fetch the target page.
client *http.Client
}

// NewLinkedPages returns a LinkedPages that issues its requests through client.
func NewLinkedPages(client *http.Client) *LinkedPages {
	lp := new(LinkedPages)
	lp.client = client
	return lp
}

// GetLinkedPages issues a GET for targetURL (bound to ctx) through l.client,
// parses the response body as HTML and collects the links found in it.
//
// It returns an error when the request cannot be built or sent, when the
// response status is anything other than 200 OK, or when the body cannot be
// parsed. On success the Internal and External slices are each ordered by
// sortURLsByFrequency.
func (l *LinkedPages) GetLinkedPages(ctx context.Context, targetURL *url.URL) (LinkedPagesData, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, targetURL.String(), nil)
if err != nil {
return LinkedPagesData{}, err

Check warning on line 30 in checks/linked_pages.go

View check run for this annotation

Codecov / codecov/patch

checks/linked_pages.go#L30

Added line #L30 was not covered by tests
}

resp, err := l.client.Do(req)
if err != nil {
return LinkedPagesData{}, err

Check warning on line 35 in checks/linked_pages.go

View check run for this annotation

Codecov / codecov/patch

checks/linked_pages.go#L35

Added line #L35 was not covered by tests
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
return LinkedPagesData{}, fmt.Errorf("received non-200 response code")

Check warning on line 40 in checks/linked_pages.go

View check run for this annotation

Codecov / codecov/patch

checks/linked_pages.go#L40

Added line #L40 was not covered by tests
}

doc, err := html.Parse(resp.Body)
if err != nil {
return LinkedPagesData{}, err

Check warning on line 45 in checks/linked_pages.go

View check run for this annotation

Codecov / codecov/patch

checks/linked_pages.go#L45

Added line #L45 was not covered by tests
}

// Each map counts how many times a resolved URL occurs in the document.
internalLinksMap := make(map[string]int)
externalLinksMap := make(map[string]int)
walkDom(doc, targetURL, internalLinksMap, externalLinksMap)

return LinkedPagesData{
Internal: sortURLsByFrequency(internalLinksMap),
External: sortURLsByFrequency(externalLinksMap),
}, nil
}

// walkDom traverses the HTML tree rooted at n depth-first and tallies the href
// of every <a> element it meets. Each href is resolved against parsedTargetURL
// with resolveURL; hrefs that fail to parse are ignored.
//
// A link is counted in internalLinksMap when its host matches the target's
// host. The comparison is case-insensitive and ignores a leading "www." on
// BOTH sides, so a target of "www.example.com" and a link to "example.com"
// (or the reverse) are treated as the same site. Any other link is counted in
// externalLinksMap, but only if its scheme is http or https; links such as
// mailto: or javascript: are dropped. Map keys are the resolved absolute URLs.
func walkDom(n *html.Node, parsedTargetURL *url.URL, internalLinksMap map[string]int, externalLinksMap map[string]int) {
	if n.Type == html.ElementNode && n.Data == "a" {
		// Host names are case-insensitive, and the "www." prefix must be
		// stripped from the target as well as from the link: stripping it from
		// the link alone makes every link on a "www." target look external.
		targetHost := strings.TrimPrefix(strings.ToLower(parsedTargetURL.Hostname()), "www.")

		for _, attr := range n.Attr {
			if attr.Key != "href" {
				continue
			}
			absoluteURL, err := resolveURL(parsedTargetURL, attr.Val)
			if err != nil {
				continue
			}
			linkHost := strings.TrimPrefix(strings.ToLower(absoluteURL.Hostname()), "www.")
			if linkHost == targetHost {
				internalLinksMap[absoluteURL.String()]++
			} else if absoluteURL.Scheme == "http" || absoluteURL.Scheme == "https" {
				externalLinksMap[absoluteURL.String()]++
			}
			// Only the first usable href of an element is counted.
			break
		}
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		walkDom(child, parsedTargetURL, internalLinksMap, externalLinksMap)
	}
}

// resolveURL turns href into the URL a browser would navigate to from baseURL.
//
// NOTE: No attempt is made to detect typos or to verify that the URL is
// reachable. An href is treated as absolute only if it carries a scheme; every
// other href, including the scheme-relative form starting with "//", is
// resolved against baseURL.
// E.g. an href of "http//example.com" against a base of "http://example.com"
// yields "http://example.com/http//example.com", since that is how a browser
// interprets it.
//
// An error is returned only when href cannot be parsed as a URL.
func resolveURL(baseURL *url.URL, href string) (*url.URL, error) {
	parsed, err := url.Parse(href)
	switch {
	case err != nil:
		return nil, err
	case parsed.Scheme != "":
		// Already absolute: use it as written.
		return parsed, nil
	default:
		return baseURL.ResolveReference(parsed), nil
	}
}

// sortURLsByFrequency returns the keys of linksMap ordered by descending
// count. URLs that share a count are ordered alphabetically, which keeps the
// result deterministic despite Go's randomized map iteration order.
//
// A nil or empty map yields a nil slice.
func sortURLsByFrequency(linksMap map[string]int) []string {
	if len(linksMap) == 0 {
		return nil
	}

	sortedLinks := make([]string, 0, len(linksMap))
	for link := range linksMap {
		sortedLinks = append(sortedLinks, link)
	}

	// Map keys are unique, so (count desc, URL asc) is a total order and one
	// non-stable sort gives the same result as sorting by URL and then
	// stable-sorting by count.
	sort.Slice(sortedLinks, func(i, j int) bool {
		a, b := sortedLinks[i], sortedLinks[j]
		if linksMap[a] != linksMap[b] {
			return linksMap[a] > linksMap[b]
		}
		return a < b
	})

	return sortedLinks
}
146 changes: 146 additions & 0 deletions checks/linked_pages_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package checks

import (
"context"
"net/http"
"net/url"
"testing"

"github.com/stretchr/testify/assert"
"github.com/xray-web/web-check-api/testutils"
)

func TestGetLinkedPages(t *testing.T) {
t.Parallel()
testTargetURL := &url.URL{
Scheme: "http",
Host: "internal.com",
}
testHTML := []byte(`
<a href="http://internal.com/#heading"></a>
<a href="//internal.com/1"></a>
<a href="2"></a>
<a href="/2"></a>
<a href="http://external.com/1"></a>
<a href="https://external.com/2"></a>
<a href="http://external.com/2"></a>
<a href="://external.com"></a>
`)
client := testutils.MockClient(testutils.Response(http.StatusOK, testHTML))
actualLinkedPagesData, err := NewLinkedPages(client).GetLinkedPages(context.TODO(), testTargetURL)
assert.NoError(t, err)
assert.Equal(t, LinkedPagesData{
Internal: []string{
"http://internal.com/2",
"http://internal.com/#heading",
"http://internal.com/1",
},
External: []string{
"http://external.com/1",
"http://external.com/2",
"https://external.com/2",
},
}, actualLinkedPagesData)
}

// TestResolveURL is a table-driven test of resolveURL against the base
// "http://example.com". It covers absolute, scheme-relative, path-relative and
// fragment-only hrefs, plus one href that fails to parse.
func TestResolveURL(t *testing.T) {
	t.Parallel()

	base := &url.URL{Scheme: "http", Host: "example.com"}

	// onBase builds the expected URL on the example.com host.
	onBase := func(path, fragment string) *url.URL {
		return &url.URL{Scheme: "http", Host: "example.com", Path: path, Fragment: fragment}
	}

	cases := []struct {
		name    string
		href    string
		want    *url.URL
		wantErr bool
	}{
		{name: "empty href", href: "", want: onBase("", "")},
		{name: "missing scheme", href: "://example.com", want: nil, wantErr: true},
		{name: "relative scheme", href: "//example.com", want: onBase("", "")},
		{name: "valid absolute url without path", href: "http://example.com", want: onBase("", "")},
		{name: "valid absolute url with path", href: "http://example.com/123", want: onBase("/123", "")},
		{name: "valid relative url with leading slash", href: "/123", want: onBase("/123", "")},
		{name: "valid relative url without leading slash", href: "123", want: onBase("/123", "")},
		{name: "valid relative url edge case", href: "http//example.com", want: onBase("/http//example.com", "")},
		{name: "valid relative url fragment", href: "#heading", want: onBase("", "heading")},
	}

	for _, tc := range cases {
		tc := tc // capture for the parallel subtest (needed before Go 1.22)
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			got, err := resolveURL(base, tc.href)

			assert.Equal(t, tc.want, got)
			if !tc.wantErr {
				assert.NoError(t, err)
				return
			}
			assert.Error(t, err)
		})
	}
}

func TestSortURLsByFrequency(t *testing.T) {
t.Parallel()
testLinksMap := map[string]int{
"https://example.com": 1,
"https://example2.com": 2,
"https://example3.com": 3,
}

expectedSortedLinks := []string{
"https://example3.com",
"https://example2.com",
"https://example.com",
}

actualSortedLinks := sortURLsByFrequency(testLinksMap)
assert.Equal(t, expectedSortedLinks, actualSortedLinks)
}
Loading