1
0
Fork 0
mirror of https://github.com/gocsaf/csaf.git synced 2025-12-22 18:15:42 +01:00

If there are year folders in directory listings, fetch files from the extra level.

This commit is contained in:
Sascha L. Teichmann 2022-05-17 15:58:34 +02:00
parent 29f26e0299
commit 131a7155fa
3 changed files with 85 additions and 30 deletions

View file

@ -12,11 +12,14 @@ import (
"io"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
var yearFolder = regexp.MustCompile(`.*/?\d{4}/?$`)
func (p *processor) linksOnPageURL(baseDir string) ([]string, error) {
base, err := url.Parse(baseDir)
@ -40,42 +43,87 @@ func (p *processor) linksOnPageURL(baseDir string) ([]string, error) {
return nil, errContinue
}
var (
subDirs []string
files []string
)
if err := func() error {
defer res.Body.Close()
// Links may be relative
return linksOnPage(res.Body, func(link string) (string, error) {
return linksOnPage(res.Body, func(link string) error {
u, err := url.Parse(link)
if err != nil {
return "", err
return err
}
return base.ResolveReference(u).String(), nil
// Links may be relative
abs := base.ResolveReference(u).String()
switch {
case yearFolder.MatchString(link):
subDirs = append(subDirs, abs)
case strings.HasSuffix(link, ".json"):
files = append(files, abs)
}
return nil
})
}
func linksOnPage(r io.Reader, resolve func(string) (string, error)) ([]string, error) {
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
}(); err != nil {
return nil, err
}
var links []string
// If we do not have sub folders, return links from this level.
if len(subDirs) == 0 {
return files, nil
}
// Descend into folders
for _, sub := range subDirs {
p.checkTLS(sub)
res, err := client.Get(sub)
if err != nil {
p.badDirListings.add("Fetching %s failed: %v", sub, err)
return nil, errContinue
}
if res.StatusCode != http.StatusOK {
p.badDirListings.add("Fetching %s failed. Status code %d (%s)",
base, res.StatusCode, res.Status)
return nil, errContinue
}
if err := func() error {
defer res.Body.Close()
return linksOnPage(res.Body, func(link string) error {
u, err := url.Parse(link)
if err != nil {
return err
}
// Links may be relative
abs := base.ResolveReference(u).String()
// Only collect json files in this sub folder
if strings.HasSuffix(link, ".json") {
files = append(files, abs)
}
return nil
})
}(); err != nil {
return nil, err
}
}
return files, nil
}
func linksOnPage(r io.Reader, visit func(string) error) error {
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return err
}
doc.Find("a").Each(func(_ int, s *goquery.Selection) {
if err != nil {
return
}
if link, ok := s.Attr("href"); ok {
// Only care for JSON files here.
if !strings.HasSuffix(link, ".json") {
return
}
if link, err = resolve(link); err == nil {
links = append(links, link)
}
err = visit(link)
}
})
return links, err
return err
}

View file

@ -24,9 +24,16 @@ const page0 = `<html>
func TestLinksOnPage(t *testing.T) {
links, err := linksOnPage(
var links []string
err := linksOnPage(
strings.NewReader(page0),
func(s string) (string, error) { return s, nil },
func(s string) error {
if strings.HasSuffix(s, ".json") {
links = append(links, s)
}
return nil
},
)
if err != nil {
t.Fatal(err)

View file

@ -252,7 +252,7 @@ func (p *processor) checkRedirect(r *http.Request, via []*http.Request) error {
p.redirects[url] = path.String()
if len(via) > 10 {
return errors.New("Too many redirections")
return errors.New("too many redirections")
}
return nil
}
@ -826,7 +826,7 @@ func (p *processor) extractProviderURL(r io.Reader) (string, error) {
return "", err
}
if len(urls) == 0 {
return "", errors.New("No provider-metadata.json found")
return "", errors.New("no provider-metadata.json found")
}
if len(urls) > 1 {