From 131a7155fa84c9e08f1a7e44e18ad9f8c89ef526 Mon Sep 17 00:00:00 2001 From: "Sascha L. Teichmann" Date: Tue, 17 May 2022 15:58:34 +0200 Subject: [PATCH] If there are year folders in directory listings, fetch files from the extra level. --- cmd/csaf_checker/links.go | 100 ++++++++++++++++++++++++--------- cmd/csaf_checker/links_test.go | 11 +++- cmd/csaf_checker/processor.go | 4 +- 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/cmd/csaf_checker/links.go b/cmd/csaf_checker/links.go index 7f49dc3..f8e885d 100644 --- a/cmd/csaf_checker/links.go +++ b/cmd/csaf_checker/links.go @@ -12,11 +12,14 @@ import ( "io" "net/http" "net/url" + "regexp" "strings" "github.com/PuerkitoBio/goquery" ) +var yearFolder = regexp.MustCompile(`.*/?\d{4}/?$`) + func (p *processor) linksOnPageURL(baseDir string) ([]string, error) { base, err := url.Parse(baseDir) @@ -40,42 +43,87 @@ func (p *processor) linksOnPageURL(baseDir string) ([]string, error) { return nil, errContinue } - defer res.Body.Close() - - // Links may be relative - return linksOnPage(res.Body, func(link string) (string, error) { - u, err := url.Parse(link) - if err != nil { - return "", err - } - return base.ResolveReference(u).String(), nil - }) - -} - -func linksOnPage(r io.Reader, resolve func(string) (string, error)) ([]string, error) { - - doc, err := goquery.NewDocumentFromReader(r) - if err != nil { + var ( + subDirs []string + files []string + ) + if err := func() error { + defer res.Body.Close() + return linksOnPage(res.Body, func(link string) error { + u, err := url.Parse(link) + if err != nil { + return err + } + // Links may be relative + abs := base.ResolveReference(u).String() + switch { + case yearFolder.MatchString(link): + subDirs = append(subDirs, abs) + case strings.HasSuffix(link, ".json"): + files = append(files, abs) + } + return nil + }) + }(); err != nil { return nil, err } - var links []string + // If we do not have sub folders, return links from this level. 
+ if len(subDirs) == 0 { + return files, nil + } + + // Descend into folders + for _, sub := range subDirs { + p.checkTLS(sub) + res, err := client.Get(sub) + if err != nil { + p.badDirListings.add("Fetching %s failed: %v", sub, err) + return nil, errContinue + } + if res.StatusCode != http.StatusOK { + p.badDirListings.add("Fetching %s failed. Status code %d (%s)", + base, res.StatusCode, res.Status) + return nil, errContinue + } + if err := func() error { + defer res.Body.Close() + return linksOnPage(res.Body, func(link string) error { + u, err := url.Parse(link) + if err != nil { + return err + } + // Links may be relative + abs := base.ResolveReference(u).String() + // Only collect json files in this sub folder + if strings.HasSuffix(link, ".json") { + files = append(files, abs) + } + return nil + }) + }(); err != nil { + return nil, err + } + } + + return files, nil +} + +func linksOnPage(r io.Reader, visit func(string) error) error { + + doc, err := goquery.NewDocumentFromReader(r) + if err != nil { + return err + } doc.Find("a").Each(func(_ int, s *goquery.Selection) { if err != nil { return } if link, ok := s.Attr("href"); ok { - // Only care for JSON files here. 
- if !strings.HasSuffix(link, ".json") { - return - } - if link, err = resolve(link); err == nil { - links = append(links, link) - } + err = visit(link) } }) - return links, err + return err } diff --git a/cmd/csaf_checker/links_test.go b/cmd/csaf_checker/links_test.go index 11d37d5..96e7e62 100644 --- a/cmd/csaf_checker/links_test.go +++ b/cmd/csaf_checker/links_test.go @@ -24,9 +24,16 @@ const page0 = ` func TestLinksOnPage(t *testing.T) { - links, err := linksOnPage( + var links []string + + err := linksOnPage( strings.NewReader(page0), - func(s string) (string, error) { return s, nil }, + func(s string) error { + if strings.HasSuffix(s, ".json") { + links = append(links, s) + } + return nil + }, ) if err != nil { t.Fatal(err) diff --git a/cmd/csaf_checker/processor.go b/cmd/csaf_checker/processor.go index 43f9aff..c6fb61d 100644 --- a/cmd/csaf_checker/processor.go +++ b/cmd/csaf_checker/processor.go @@ -252,7 +252,7 @@ func (p *processor) checkRedirect(r *http.Request, via []*http.Request) error { p.redirects[url] = path.String() if len(via) > 10 { - return errors.New("Too many redirections") + return errors.New("too many redirections") } return nil } @@ -826,7 +826,7 @@ func (p *processor) extractProviderURL(r io.Reader) (string, error) { return "", err } if len(urls) == 0 { - return "", errors.New("No provider-metadata.json found") + return "", errors.New("no provider-metadata.json found") } if len(urls) > 1 {