nws/protocol/domain.go
2024-11-17 22:02:50 +01:00

383 lines
12 KiB
Go

package protocol
import (
"errors"
"fmt"
"net"
"net/url"
"strings"
"golang.org/x/net/idna"
"golang.org/x/net/publicsuffix"
)
const (
ipV6URINotationPrefix = "["
ipV6URINotationSuffix = "]"
)
var ErrEmptyURL = errors.New("url to be parsed is empty")
// URL represents a URL with additional fields and methods.
type URL struct {
SubName, Name, TLD, Port string
IsDomain bool
*url.URL
}
// String returns the string representation of the URL.
// It includes the scheme if `includeScheme` is true.
func (url URL) String(includeScheme bool) string {
s := url.URL.String()
if !includeScheme {
s = RemoveScheme(s)
}
return s
}
// Domain returns the domain name of the URL. If includeSub is true and there is a subdomain, it includes the subdomain
// in the returned string. Otherwise, it only includes the domain.
func (url URL) Domain(includeSub bool) string {
if includeSub && url.SubName != "" {
return fmt.Sprintf("%s.%s.%s", url.SubName, url.Name, url.TLD)
}
return fmt.Sprintf("%s.%s", url.Name, url.TLD)
}
// NoWWW returns the domain name without the "www" subdomain.
// If the subdomain is not "www" or is empty, it returns the domain name as is.
// The returned domain name is a string in the format "subname.name.tld".
func (url URL) NoWWW() string {
if url.SubName != "www" && url.SubName != "" {
return fmt.Sprintf("%s.%s.%s", url.SubName, url.Name, url.TLD)
}
return fmt.Sprintf("%s.%s", url.Name, url.TLD)
}
// WWW returns the domain name with the "www" subdomain.
// If the subdomain is not "www", it returns the domain name as is.
// The returned domain name is a string in the format "subname.name.tld".
func (url URL) WWW() string {
if url.SubName != "" {
return fmt.Sprintf("%s.%s.%s", url.SubName, url.Name, url.TLD)
}
return fmt.Sprintf("%s.%s.%s", "www", url.Name, url.TLD)
}
// HTTPS returns the URL with HTTPS Scheme but leaves the URL itself untouched.
func (url URL) HTTPS() string {
rememberScheme := url.Scheme
url.Scheme = "https"
httpsURL := url.String(true)
url.Scheme = rememberScheme
return httpsURL
}
// StripWWW returns the URL without "www" subdomain, but leaves the URL itself untouched.
// This function returns the whole URL with its path, in contrast to NoWWW().
func (url URL) StripWWW(includeScheme bool) string {
if url.SubName == "www" {
return strings.Replace(url.String(includeScheme), "www.", "", 1)
}
return url.String(includeScheme)
}
// StripQueryParams removes query parameters and fragments from the URL and returns
// the URL as a string. If includeScheme is true, it includes the scheme in the returned URL.
func (url URL) StripQueryParams(includeScheme bool) string {
// Remember the original values of query parameters and fragments
rememberRawQuery := url.RawQuery
rememberFragment := url.Fragment
rememberRawFragment := url.RawFragment
// Clear the query parameters and fragments
url.RawQuery = ""
url.RawFragment = ""
url.Fragment = ""
// Get the URL without query parameters
urlWithoutQuery := url.String(includeScheme)
// Restore the original values of query parameters and fragments
url.RawQuery = rememberRawQuery
url.RawFragment = rememberRawFragment
url.Fragment = rememberFragment
return urlWithoutQuery
}
// IsLocal checks if the URL is a local address.
// It returns true if the URL's top-level domain (TLD) is "localhost" or if the URL's
// hostname resolves to a loopback IP address.
func (url URL) IsLocal() bool {
ip := net.ParseIP(strings.TrimPrefix(strings.TrimSuffix(url.Name, ipV6URINotationSuffix), ipV6URINotationPrefix))
return url.TLD == "localhost" || (ip != nil && ip.IsLoopback())
}
// Parse parses a string representation of a URL and returns a *URL and error.
// It mirrors the net/url.Parse function but returns a tld.URL, which contains extra fields.
func Parse(urlString string) (*URL, error) {
urlString = strings.TrimSpace(urlString)
// if the url to be parsed is empty after trimming, we return an error
if len(urlString) == 0 {
return nil, ErrEmptyURL
}
urlString = AddDefaultScheme(urlString)
parsedURL, err := url.Parse(urlString)
if err != nil {
return nil, fmt.Errorf("could not parse url: %w", err)
}
// always lowercase subdomain.domain.tld (host property)
parsedURL.Host = strings.ToLower(parsedURL.Host)
if parsedURL.Host == "" {
return &URL{URL: parsedURL}, nil
}
dom, port := domainPort(parsedURL.Host)
var domName, tld, sub string
ip := net.ParseIP(strings.TrimPrefix(strings.TrimSuffix(dom, ipV6URINotationSuffix), ipV6URINotationPrefix))
switch {
case ip != nil:
domName = dom
case dom == "localhost":
tld = dom
default:
etld1, err := publicsuffix.EffectiveTLDPlusOne(dom)
if err != nil {
return nil, fmt.Errorf("failed to extract eTLD+1: %w", err)
}
i := strings.Index(etld1, ".")
domName = etld1[0:i]
tld = etld1[i+1:]
sub = ""
if rest := strings.TrimSuffix(dom, "."+etld1); rest != dom {
sub = rest
}
}
urlString, err = idna.ToASCII(dom)
if err != nil {
return nil, fmt.Errorf("failed to convert domain to ASCII: %w", err)
}
return &URL{
SubName: sub,
Name: domName,
TLD: tld,
Port: port,
URL: parsedURL,
IsDomain: IsDomainName(urlString),
}, nil
}
// FromParsed mirrors the net/url.Parse function,
// but instead of returning a *url.URL, it returns a *URL,
// which is a struct that contains additional fields.
//
// The function first checks if the parsedUrl.Host field is empty.
// If it is empty, it returns a *URL with the URL field set to parsedUrl
// and all other fields set to their zero values.
//
// If the parsedUrl.Host field is not empty, it extracts the domain and port
// using the domainPort function.
//
// It then calculates the effective top-level domain plus one (etld+1)
// using the publicsuffix.EffectiveTLDPlusOne function.
//
// The etld+1 is then split into the domain name (domName) and the top-level domain (tld).
//
// It further determines the subdomain (sub) by checking if the domain is a subdomain of the etld+1.
//
// The domain name (domName) is then converted to ASCII using the idna.ToASCII function.
//
// Finally, it returns a *URL with the extracted values and the URL field set to parsedUrl.
// The IsDomain field is set to the result of the IsDomainName function called with the ASCII domain name.
// The SubName field is set to sub, the Name field is set to domName, and the T.
func FromParsed(parsedURL *url.URL) (*URL, error) {
if parsedURL.Host == "" {
return &URL{URL: parsedURL}, nil
}
dom, port := domainPort(parsedURL.Host)
// etld+1
etld1, err := publicsuffix.EffectiveTLDPlusOne(dom)
if err != nil {
return nil, fmt.Errorf("failed to extract eTLD+1: %w", err)
}
// convert to domain name, and tld
i := strings.Index(etld1, ".")
domName := etld1[0:i]
tld := etld1[i+1:]
// and subdomain
sub := ""
if rest := strings.TrimSuffix(dom, "."+etld1); rest != dom {
sub = rest
}
asciiDom, err := idna.ToASCII(dom)
if err != nil {
return nil, fmt.Errorf("failed to convert domain to ASCII: %w", err)
}
return &URL{
SubName: sub,
Name: domName,
TLD: tld,
Port: port,
URL: parsedURL,
IsDomain: IsDomainName(asciiDom),
}, nil
}
// domainPort extracts the domain and port from the host part of a URL.
// If the host contains a port, it returns the domain without the port and the port as strings.
// If the host does not contain a port, it returns the domain and an empty string for the port.
// If the host is all numeric characters, it returns the host itself and an empty string for the port.
// Note that the net/url package should prevent the string from being all numeric characters.
func domainPort(host string) (string, string) {
for i := len(host) - 1; i >= 0; i-- {
if host[i] == ':' {
return host[:i], host[i+1:]
} else if host[i] < '0' || host[i] > '9' {
return host, ""
}
}
// will only land here if the string is all digits,
// net/url should prevent that from happening
return host, ""
}
// IsDomainName checks if a string represents a valid domain name.
//
// It follows the rules specified in RFC 1035 and RFC 3696 for domain name validation.
//
// The input string is first processed with the RemoveScheme function to remove any scheme prefix.
// The domain name is then split into labels using the dot separator.
// The function checks that the number of labels is at least 2 and that the total length of the string is between 1 and
// 254 characters.
//
// The function iterates over the characters of the string and performs checks based on the character type.
// Valid characters include letters (a-zA-Z), digits (0-9), underscore (_), and hyphen (-).
// Each label can contain up to 63 characters and the last label cannot end with a hyphen.
// The function also checks that the byte before a dot or a hyphen is not a dot or a hyphen, respectively.
// Non-numeric characters are tracked to ensure the presence of at least one non-numeric character in the domain name.
//
// If any of the checks fail, the function returns false. Otherwise, it returns true.
//
// Example usage:
// s := "mail.google.com"
// isValid := IsDomainName(s).
func IsDomainName(name string) bool { //nolint:cyclop
name = RemoveScheme(name)
// See RFC 1035, RFC 3696.
// Presentation format has dots before every label except the first, and the
// terminal empty label is optional here because we assume fully-qualified
// (absolute) input. We must therefore reserve space for the first and last
// labels' length octets in wire format, where they are necessary and the
// maximum total length is 255.
// So our _effective_ maximum is 253, but 254 is not rejected if the last
// character is a dot.
split := strings.Split(name, ".")
// Need a TLD and a domain.
if len(split) < 2 { //nolint:mnd
return false
}
l := len(name)
if l == 0 || l > 254 || l == 254 && name[l-1] != '.' {
return false
}
last := byte('.')
nonNumeric := false // true once we've seen a letter or hyphen
partlen := 0
for i := 0; i < len(name); i++ {
char := name[i]
switch {
default:
return false
case 'a' <= char && char <= 'z' || 'A' <= char && char <= 'Z' || char == '_':
nonNumeric = true
partlen++
case '0' <= char && char <= '9':
// fine
partlen++
case char == '-':
// Byte before dash cannot be dot.
if last == '.' {
return false
}
partlen++
nonNumeric = true
case char == '.':
// Byte before dot cannot be dot, dash.
if last == '.' || last == '-' {
return false
}
if partlen > 63 || partlen == 0 {
return false
}
partlen = 0
}
last = char
}
if last == '-' || partlen > 63 {
return false
}
return nonNumeric
}
// RemoveScheme removes the scheme from a URL string.
// If the URL string includes a scheme (e.g., "http://"),
// the scheme will be removed and the remaining string will be returned.
// If the URL string includes a default scheme (e.g., "//"),
// the default scheme will be removed and the remaining string will be returned.
// If the URL string does not include a scheme, the original string will be returned unchanged.
func RemoveScheme(s string) string {
if strings.Contains(s, "://") {
return removeScheme(s)
}
if strings.Contains(s, "//") {
return removeDefaultScheme(s)
}
return s
}
// add default scheme if string does not include a scheme.
func AddDefaultScheme(s string) string {
if !strings.Contains(s, "//") ||
(!strings.Contains(s, "//") && !strings.Contains(s, ":") && !strings.Contains(s, "@")) {
return addDefaultScheme(s)
}
return s
}
func AddScheme(s, scheme string) string {
if scheme == "" {
return AddDefaultScheme(s)
}
if strings.Index(s, "//") == -1 {
return fmt.Sprintf("%s://%s", scheme, s)
}
return s
}
// addDefaultScheme returns a new string with a default scheme added.
// The default scheme format is "//<original_string>".
func addDefaultScheme(s string) string {
return fmt.Sprintf("//%s", s)
}
// removeDefaultScheme removes the default scheme from a string.
func removeDefaultScheme(s string) string {
return s[index(s, "//"):]
}
func removeScheme(s string) string {
return s[index(s, "://"):]
}
// index returns the starting index of the first occurrence of the specified scheme in the given string.
// If the scheme is not found, it returns -1.
// The returned int is incremented by the length of the scheme to obtain the starting position of the remaining string.
func index(s, scheme string) int {
return strings.Index(s, scheme) + len(scheme)
}