1
0
Fork 0
mirror of https://github.com/gocsaf/csaf.git synced 2025-12-22 11:55:40 +01:00

Improve joining of url paths in some situations

* Use url.JoinPath to join URLs from a few places.
* Add util/joinpath.go from go 1.19, add the license in REUSE 3.0 compatible manner.

resolve #223

Co-authored-by: Bernhard Reiter <bernhard@intevation.de>
This commit is contained in:
Sascha L. Teichmann 2022-07-18 17:41:52 +02:00 committed by GitHub
parent 324de3abca
commit 9cba4eec30
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 388 additions and 3 deletions

301
util/joinpath.go Normal file
View file

@ -0,0 +1,301 @@
// SPDX-License-Identifier: LicenseRef-Go119-BSD-Patentgrant
// SPDX-FileCopyrightText: 2009 The Go Authors, Google Inc.
// The code of this file was extracted and adjusted from
// https://cs.opensource.google/go/go/+/refs/tags/go1.19rc2:src/net/url/url.go
// by Intevation 2022
//go:build !go1.19
package util
import (
"net/url"
"path"
"strings"
)
type encoding int
const (
encodePath encoding = 1 + iota
encodePathSegment
encodeHost
encodeZone
encodeUserPassword
encodeQueryComponent
encodeFragment
)
const upperhex = "0123456789ABCDEF"
func ishex(c byte) bool {
switch {
case '0' <= c && c <= '9':
return true
case 'a' <= c && c <= 'f':
return true
case 'A' <= c && c <= 'F':
return true
}
return false
}
func unhex(c byte) byte {
switch {
case '0' <= c && c <= '9':
return c - '0'
case 'a' <= c && c <= 'f':
return c - 'a' + 10
case 'A' <= c && c <= 'F':
return c - 'A' + 10
}
return 0
}
// Return true if the specified character should be escaped when
// appearing in a URL string, according to RFC 3986.
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
// §2.3 Unreserved characters (alphanum)
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
return false
}
if mode == encodeHost || mode == encodeZone {
// §3.2.2 Host allows
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// as part of reg-name.
// We add : because we include :port as part of host.
// We add [ ] because we include [ipv6]:port as part of host.
// We add < > because they're the only characters left that
// we could possibly allow, and Parse will reject them if we
// escape them (because hosts can't use %-encoding for
// ASCII bytes).
switch c {
case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
return false
}
}
switch c {
case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
return false
case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
// Different sections of the URL allow a few of
// the reserved characters to appear unescaped.
switch mode {
case encodePath: // §3.3
// The RFC allows : @ & = + $ but saves / ; , for assigning
// meaning to individual path segments. This package
// only manipulates the path as a whole, so we allow those
// last three as well. That leaves only ? to escape.
return c == '?'
case encodePathSegment: // §3.3
// The RFC allows : @ & = + $ but saves / ; , for assigning
// meaning to individual path segments.
return c == '/' || c == ';' || c == ',' || c == '?'
case encodeUserPassword: // §3.2.1
// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
// userinfo, so we must escape only '@', '/', and '?'.
// The parsing of userinfo treats ':' as special so we must escape
// that too.
return c == '@' || c == '/' || c == '?' || c == ':'
case encodeQueryComponent: // §3.4
// The RFC reserves (so we must escape) everything.
return true
case encodeFragment: // §4.1
// The RFC text is silent but the grammar allows
// everything, so escape nothing.
return false
}
}
if mode == encodeFragment {
// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
// need to be escaped. To minimize potential breakage, we apply two restrictions:
// (1) we always escape sub-delims outside of the fragment, and (2) we always
// escape single quote to avoid breaking callers that had previously assumed that
// single quotes would be escaped. See issue #19917.
switch c {
case '!', '(', ')', '*':
return false
}
}
// Everything else must be escaped.
return true
}
// unescape unescapes a string; the mode specifies
// which section of the URL string is being unescaped.
func unescape(s string, mode encoding) (string, error) {
// Count %, check that they're well-formed.
n := 0
hasPlus := false
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", url.EscapeError(s)
}
// Per https://tools.ietf.org/html/rfc3986#page-21
// in the host component %-encoding can only be used
// for non-ASCII bytes.
// But https://tools.ietf.org/html/rfc6874#section-2
// introduces %25 being allowed to escape a percent sign
// in IPv6 scoped-address literals. Yay.
if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
return "", url.EscapeError(s[i : i+3])
}
if mode == encodeZone {
// RFC 6874 says basically "anything goes" for zone identifiers
// and that even non-ASCII can be redundantly escaped,
// but it seems prudent to restrict %-escaped bytes here to those
// that are valid host name bytes in their unescaped form.
// That is, you can use escaping in the zone identifier but not
// to introduce bytes you couldn't just write directly.
// But Windows puts spaces here! Yay.
v := unhex(s[i+1])<<4 | unhex(s[i+2])
if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) {
return "", url.EscapeError(s[i : i+3])
}
}
i += 3
case '+':
hasPlus = mode == encodeQueryComponent
i++
default:
if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
return "", url.InvalidHostError(s[i : i+1])
}
i++
}
}
if n == 0 && !hasPlus {
return s, nil
}
var t strings.Builder
t.Grow(len(s) - 2*n)
for i := 0; i < len(s); i++ {
switch s[i] {
case '%':
t.WriteByte(unhex(s[i+1])<<4 | unhex(s[i+2]))
i += 2
case '+':
if mode == encodeQueryComponent {
t.WriteByte(' ')
} else {
t.WriteByte('+')
}
default:
t.WriteByte(s[i])
}
}
return t.String(), nil
}
func escape(s string, mode encoding) string {
spaceCount, hexCount := 0, 0
for i := 0; i < len(s); i++ {
c := s[i]
if shouldEscape(c, mode) {
if c == ' ' && mode == encodeQueryComponent {
spaceCount++
} else {
hexCount++
}
}
}
if spaceCount == 0 && hexCount == 0 {
return s
}
var buf [64]byte
var t []byte
required := len(s) + 2*hexCount
if required <= len(buf) {
t = buf[:required]
} else {
t = make([]byte, required)
}
if hexCount == 0 {
copy(t, s)
for i := 0; i < len(s); i++ {
if s[i] == ' ' {
t[i] = '+'
}
}
return string(t)
}
j := 0
for i := 0; i < len(s); i++ {
switch c := s[i]; {
case c == ' ' && mode == encodeQueryComponent:
t[j] = '+'
j++
case shouldEscape(c, mode):
t[j] = '%'
t[j+1] = upperhex[c>>4]
t[j+2] = upperhex[c&15]
j += 3
default:
t[j] = s[i]
j++
}
}
return string(t)
}
func setPath(u *url.URL, p string) error {
path, err := unescape(p, encodePath)
if err != nil {
return err
}
u.Path = path
if escp := escape(path, encodePath); p == escp {
// Default encoding is fine.
u.RawPath = ""
} else {
u.RawPath = p
}
return nil
}
// JoinURLPath returns a new URL with the provided path elements joined to
// any existing path and the resulting path cleaned of any ./ or ../ elements.
// Any sequences of multiple / characters will be reduced to a single /.
func JoinURLPath(u *url.URL, elem ...string) *url.URL {
url := *u
if len(elem) > 0 {
elem = append([]string{u.EscapedPath()}, elem...)
p := path.Join(elem...)
// path.Join will remove any trailing slashes.
// Preserve at least one.
if strings.HasSuffix(elem[len(elem)-1], "/") && !strings.HasSuffix(p, "/") {
p += "/"
}
setPath(&url, p)
}
return &url
}