Skip to content

Commit 387d79f

Browse files
vleong99 and kynrai authored
RF: moved linked pages to checks, added tests (#47)
Co-authored-by: Steven Lee <steven.lee@kynrai.com>
1 parent 4fc4787 commit 387d79f

File tree

6 files changed

+356
-147
lines changed

6 files changed

+356
-147
lines changed

checks/checks.go

+15-13
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,28 @@ import (
88
)
99

1010
type Checks struct {
11-
Carbon *Carbon
12-
Headers *Headers
13-
IpAddress *Ip
14-
LegacyRank *LegacyRank
15-
Rank *Rank
16-
SocialTags *SocialTags
17-
Tls *Tls
11+
Carbon *Carbon
12+
Headers *Headers
13+
IpAddress *Ip
14+
LegacyRank *LegacyRank
15+
LinkedPages *LinkedPages
16+
Rank *Rank
17+
SocialTags *SocialTags
18+
Tls *Tls
1819
}
1920

2021
func NewChecks() *Checks {
2122
client := &http.Client{
2223
Timeout: 5 * time.Second,
2324
}
2425
return &Checks{
25-
Carbon: NewCarbon(client),
26+
Carbon: NewCarbon(client),
2627
Headers: NewHeaders(client),
27-
IpAddress: NewIp(NewNetIp()),
28-
LegacyRank: NewLegacyRank(legacyrank.NewInMemoryStore()),
29-
Rank: NewRank(client),
30-
SocialTags: NewSocialTags(client),
31-
Tls: NewTls(client),
28+
IpAddress: NewIp(NewNetIp()),
29+
LegacyRank: NewLegacyRank(legacyrank.NewInMemoryStore()),
30+
LinkedPages: NewLinkedPages(client),
31+
Rank: NewRank(client),
32+
SocialTags: NewSocialTags(client),
33+
Tls: NewTls(client),
3234
}
3335
}

checks/linked_pages.go

+120
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"net/http"
7+
"net/url"
8+
"sort"
9+
"strings"
10+
11+
"golang.org/x/net/html"
12+
)
13+
14+
// LinkedPagesData is the result returned by GetLinkedPages: the URLs linked
// from a page, split into internal and external lists. Both fields are
// serialised to JSON under the lower-case keys given in the struct tags.
type LinkedPagesData struct {
15+
// Internal holds the links that walkDom classified as pointing at the target's own host.
Internal []string `json:"internal"`
16+
// External holds the http/https links that walkDom classified as pointing at another host.
External []string `json:"external"`
17+
}
18+
19+
// LinkedPages is the check that collects the links found on a page. It is
// built with NewLinkedPages and used through GetLinkedPages.
type LinkedPages struct {
20+
// client is the HTTP client GetLinkedPages uses to fetch the target page.
client *http.Client
21+
}
22+
23+
func NewLinkedPages(client *http.Client) *LinkedPages {
24+
return &LinkedPages{client: client}
25+
}
26+
27+
func (l *LinkedPages) GetLinkedPages(ctx context.Context, targetURL *url.URL) (LinkedPagesData, error) {
28+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, targetURL.String(), nil)
29+
if err != nil {
30+
return LinkedPagesData{}, err
31+
}
32+
33+
resp, err := l.client.Do(req)
34+
if err != nil {
35+
return LinkedPagesData{}, err
36+
}
37+
defer resp.Body.Close()
38+
39+
if resp.StatusCode != http.StatusOK {
40+
return LinkedPagesData{}, fmt.Errorf("received non-200 response code")
41+
}
42+
43+
doc, err := html.Parse(resp.Body)
44+
if err != nil {
45+
return LinkedPagesData{}, err
46+
}
47+
48+
internalLinksMap := make(map[string]int)
49+
externalLinksMap := make(map[string]int)
50+
walkDom(doc, targetURL, internalLinksMap, externalLinksMap)
51+
52+
return LinkedPagesData{
53+
Internal: sortURLsByFrequency(internalLinksMap),
54+
External: sortURLsByFrequency(externalLinksMap),
55+
}, nil
56+
}
57+
58+
func walkDom(n *html.Node, parsedTargetURL *url.URL, internalLinksMap map[string]int, externalLinksMap map[string]int) {
59+
if n.Type == html.ElementNode && n.Data == "a" {
60+
for _, attr := range n.Attr {
61+
if attr.Key == "href" {
62+
href := attr.Val
63+
absoluteURL, err := resolveURL(parsedTargetURL, href)
64+
if err != nil {
65+
continue
66+
}
67+
if strings.TrimPrefix(absoluteURL.Hostname(), "www.") == parsedTargetURL.Hostname() {
68+
internalLinksMap[absoluteURL.String()]++
69+
} else if absoluteURL.Scheme == "http" || absoluteURL.Scheme == "https" {
70+
externalLinksMap[absoluteURL.String()]++
71+
}
72+
break
73+
}
74+
}
75+
}
76+
for child := n.FirstChild; child != nil; child = child.NextSibling {
77+
walkDom(child, parsedTargetURL, internalLinksMap, externalLinksMap)
78+
}
79+
}
80+
81+
// resolveURL converts href into an absolute URL the way a browser would
// interpret it on a page served from baseURL. It does NOT check for typos or
// whether the URL is reachable.
//
// Leading and trailing ASCII whitespace (space, tab, LF, FF, CR) is stripped
// before parsing, as browsers do for href attributes. Without this, an href
// such as " http://example.com" fails to parse and the link is lost.
//
// Only hrefs containing a scheme are returned as parsed. Everything else,
// including hrefs beginning with "//" (a scheme-relative reference), is
// resolved against baseURL.
// E.g. a href of "http//example.com" resolves against a base URL of
// "http://example.com" as "http://example.com/http//example.com", since this
// is how the browser will interpret it.
//
// An error is returned when url.Parse rejects the trimmed href.
func resolveURL(baseURL *url.URL, href string) (*url.URL, error) {
	u, err := url.Parse(strings.Trim(href, " \t\n\f\r"))
	if err != nil {
		return nil, err
	}
	if u.Scheme == "" {
		return baseURL.ResolveReference(u), nil
	}
	return u, nil
}
94+
95+
// sortURLsByFrequency returns the keys of linksMap ordered by descending
// count. URLs with equal counts are ordered ascending by string comparison, so
// the output is deterministic despite Go's random map iteration order.
//
// The result is always non-nil: an empty or nil map yields an empty slice,
// which JSON-encodes as [] rather than null.
func sortURLsByFrequency(linksMap map[string]int) []string {
	sortedLinks := make([]string, 0, len(linksMap))
	for link := range linksMap {
		sortedLinks = append(sortedLinks, link)
	}

	// Map keys are unique, so this comparator is a strict total order and a
	// single unstable sort gives the same result as sorting by URL and then
	// stably by frequency.
	sort.Slice(sortedLinks, func(i, j int) bool {
		a, b := sortedLinks[i], sortedLinks[j]
		if fa, fb := linksMap[a], linksMap[b]; fa != fb {
			return fa > fb
		}
		return a < b
	})

	return sortedLinks
}

checks/linked_pages_test.go

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"net/http"
6+
"net/url"
7+
"testing"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/xray-web/web-check-api/testutils"
11+
)
12+
13+
func TestGetLinkedPages(t *testing.T) {
14+
t.Parallel()
15+
testTargetURL := &url.URL{
16+
Scheme: "http",
17+
Host: "internal.com",
18+
}
19+
testHTML := []byte(`
20+
<a href="http://internal.com/#heading"></a>
21+
<a href="//internal.com/1"></a>
22+
<a href="2"></a>
23+
<a href="/2"></a>
24+
<a href="http://external.com/1"></a>
25+
<a href="https://external.com/2"></a>
26+
<a href="http://external.com/2"></a>
27+
<a href="://external.com"></a>
28+
`)
29+
client := testutils.MockClient(testutils.Response(http.StatusOK, testHTML))
30+
actualLinkedPagesData, err := NewLinkedPages(client).GetLinkedPages(context.TODO(), testTargetURL)
31+
assert.NoError(t, err)
32+
assert.Equal(t, LinkedPagesData{
33+
Internal: []string{
34+
"http://internal.com/2",
35+
"http://internal.com/#heading",
36+
"http://internal.com/1",
37+
},
38+
External: []string{
39+
"http://external.com/1",
40+
"http://external.com/2",
41+
"https://external.com/2",
42+
},
43+
}, actualLinkedPagesData)
44+
}
45+
46+
func TestResolveURL(t *testing.T) {
47+
t.Parallel()
48+
baseURL := url.URL{
49+
Scheme: "http",
50+
Host: "example.com",
51+
}
52+
53+
tests := []struct {
54+
name string
55+
href string
56+
expectedResolvedURL (*url.URL)
57+
expectedErrorExists bool
58+
}{
59+
{
60+
name: "empty href",
61+
href: "",
62+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com"},
63+
expectedErrorExists: false,
64+
},
65+
{
66+
name: "missing scheme",
67+
href: "://example.com",
68+
expectedResolvedURL: nil,
69+
expectedErrorExists: true,
70+
},
71+
{
72+
name: "relative scheme",
73+
href: "//example.com",
74+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com"},
75+
expectedErrorExists: false,
76+
},
77+
{
78+
name: "valid absolute url without path",
79+
href: "http://example.com",
80+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com"},
81+
expectedErrorExists: false,
82+
},
83+
{
84+
name: "valid absolute url with path",
85+
href: "http://example.com/123",
86+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com", Path: "/123"},
87+
expectedErrorExists: false,
88+
},
89+
{
90+
name: "valid relative url with leading slash",
91+
href: "/123",
92+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com", Path: "/123"},
93+
expectedErrorExists: false,
94+
},
95+
{
96+
name: "valid relative url without leading slash",
97+
href: "123",
98+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com", Path: "/123"},
99+
expectedErrorExists: false,
100+
},
101+
{
102+
name: "valid relative url edge case",
103+
href: "http//example.com",
104+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com", Path: "/http//example.com"},
105+
expectedErrorExists: false,
106+
},
107+
{
108+
name: "valid relative url fragment",
109+
href: "#heading",
110+
expectedResolvedURL: &url.URL{Scheme: "http", Host: "example.com", Fragment: "heading"},
111+
expectedErrorExists: false,
112+
},
113+
}
114+
115+
for _, tc := range tests {
116+
tc := tc
117+
t.Run(tc.name, func(t *testing.T) {
118+
t.Parallel()
119+
actualResolvedURL, err := resolveURL(&baseURL, tc.href)
120+
assert.Equal(t, tc.expectedResolvedURL, actualResolvedURL)
121+
if tc.expectedErrorExists {
122+
assert.Error(t, err)
123+
} else {
124+
assert.NoError(t, err)
125+
}
126+
})
127+
}
128+
}
129+
130+
func TestSortURLsByFrequency(t *testing.T) {
131+
t.Parallel()
132+
testLinksMap := map[string]int{
133+
"https://example.com": 1,
134+
"https://example2.com": 2,
135+
"https://example3.com": 3,
136+
}
137+
138+
expectedSortedLinks := []string{
139+
"https://example3.com",
140+
"https://example2.com",
141+
"https://example.com",
142+
}
143+
144+
actualSortedLinks := sortURLsByFrequency(testLinksMap)
145+
assert.Equal(t, expectedSortedLinks, actualSortedLinks)
146+
}

0 commit comments

Comments
 (0)