---- -*- Mode: rpl; -*-
---- vim:syn=rosie
----
---- net.rpl Rosie Pattern Language patterns for hostnames, ip addresses, and such
----
---- © Copyright IBM Corporation 2016, 2017.
---- LICENSE: MIT License (https://opensource.org/licenses/mit-license.html)
---- AUTHORS: Jamie A. Jennings, Kevin Zander
-- FUTURE: Refactor ipv6_mixed and ip_address_v6 for efficiency?
rpl 1.1
package net
import num
-- RFC1035 Domain Names - Implementation and Specification
-- (https://tools.ietf.org/html/rfc1035)
-- RFC1123 Requirements for Internet Hosts -- Application and Support
-- (https://tools.ietf.org/html/rfc1123)
-- RFC3696 Application Techniques for Checking and Transformation of Names
-- (https://tools.ietf.org/html/rfc3696)
-- RFC5952 A Recommendation for IPv6 Address Text Representation
-- (https://tools.ietf.org/html/rfc5952)
-- RFC2181 Clarifications to the DNS Specification
-- (https://tools.ietf.org/html/rfc2181#section-11)
---------------------------------------------------------------------------------------------------
-- IP ADDRESSES
---------------------------------------------------------------------------------------------------
-- One decimal octet in the range 0-255.  The alternatives are tried in order, so the
-- three-digit forms for 200-255 are listed before the shorter "2" / "2d" fallback.
local alias ipv4_component = "0" /
                             {"1" [0-9]{0,2}} /
                             {"2" [0-4] [0-9]} /
                             {"25" [0-5]} /
                             {"2" [0-9]?} /
                             {[3-9] [0-9]?}
-- Dotted quad: three octets each followed by a dot, then a final octet.
local alias ip_address_v4 = { {ipv4_component "."}{3} ipv4_component }
-- One IPv6 group: 1 to 4 hexadecimal digits.
local alias ipv6_component = [:xdigit:]{1,4}
-- A colon followed by one group.  The negative look-ahead (!>ipv4) makes this fail when the
-- text after the colon is a dotted-quad ipv4 address, so the ipv4 tail of a mixed address is
-- never consumed as an ordinary group.
local alias ipv6_rest = { ":" !>ipv4 ipv6_component }
-- ipv6 per RFC5952: hexadecimal groups only, no embedded ipv4.
-- The first alternative is the full 8-group form.  Each of the other alternatives puts the
-- "::" abbreviation at a different position and allows at most 6 written groups around it.
-- The trailing-"::" alternative accepts 1 to 6 leading groups, so that prefixes-as-addresses
-- such as "fe80::" and "2001:db8::" match (previously exactly 6 leading groups were required).
local alias ip_address_v6 = { ipv6_component ipv6_rest{7} } /
                            { ipv6_component "::" ipv6_component ipv6_rest{0,4} } /
                            { ipv6_component ipv6_rest{1} "::" ipv6_component ipv6_rest{0,3} } /
                            { ipv6_component ipv6_rest{2} "::" ipv6_component ipv6_rest{0,2} } /
                            { ipv6_component ipv6_rest{3} "::" ipv6_component ipv6_rest{0,1} } /
                            { ipv6_component ipv6_rest{4} "::" ipv6_component } /
                            { ipv6_component ipv6_rest{0,5} "::" } /
                            { "::" ipv6_component ipv6_rest{0,5} } /
                            { "::" } -- undefined address
-- test ipv6_strict accepts "fe80::", "2001:db8::", "1:2:3:4:5:6::"
-- A pattern for ipv6 that does NOT recognize mixed ipv4/ipv6.
-- It is a plain (non-local, non-alias) binding of the local alias ip_address_v6.
ipv6_strict = ip_address_v6
-- test ipv6_strict rejects "::192.9.5.5", "::FFFF:129.144.52.38"
-- Last piece of an abbreviated mixed address: a colon, then a dotted-quad ipv4 or one more group.
-- NOTE(review): this relies on RPL reading `":" ipv4 / ipv6_component` as
-- `":" {ipv4 / ipv6_component}` -- confirm against the RPL reference before regrouping.
local alias ipv6_end = {":" ipv4 / ipv6_component}
-- IPv6 addresses whose last 32 bits may be written as a dotted-quad ipv4 address.
--   * Unabbreviated forms: 6 groups followed by ":" ipv4, or the plain 8-group form.  (7 groups
--     followed by an ipv4 address would be 144 bits and is no longer accepted.)
--   * Abbreviated forms: one alternative per position of "::".  The "::" ipv4 form accepts
--     1 to 5 leading groups, so that e.g. "64:ff9b::192.0.2.33" and "1::1.2.3.4" match.
alias ipv6_mixed = { ipv6_component ipv6_rest{5} ":" ipv4 } /
                   { ipv6_component ipv6_rest{7} } /
                   { ipv6_component "::" ipv6_component ipv6_rest{0,3} ipv6_end} /
                   { ipv6_component ipv6_rest{1} "::" ipv6_component ipv6_rest{0,2} ipv6_end} /
                   { ipv6_component ipv6_rest{2} "::" ipv6_component ipv6_rest{0,1} ipv6_end} /
                   { ipv6_component ipv6_rest{3} "::" ipv6_component ipv6_end } /
                   { ipv6_component ipv6_rest{0,4} "::" ipv4 } /
                   { "::" ipv6_component ipv6_rest{0,4} ipv6_end} /
                   { "::" ipv4 }
-- test ipv6 accepts "1:2:3:4:5:6:1.2.3.4", "64:ff9b::192.0.2.33", "1::1.2.3.4"
-- test ipv6 rejects "1:2:3:4:5:6:7:1.2.3.4"
-- Dotted-quad IPv4 address; a plain binding of the local alias ip_address_v4.
ipv4 = ip_address_v4
-- test ipv4 accepts "0.0.0.0", "1.2.234.123",
-- test ipv4 rejects "1234.1.2.3", "1.2.3", "111.222.333.", "111.222.333..444"
-- test ipv4 rejects "999.999.999.999", "256.1.1.1", "1.256.1.1", "1.1.256.1", "1.1.1.256"
-- Any IPv6 text form.  The mixed (embedded ipv4) alternatives are tried before the purely
-- hexadecimal ones.
ipv6 = ipv6_mixed / ip_address_v6
-- test ipv6 includes ipv4 "::192.9.5.5", "::FFFF:129.144.52.38"
-- test ipv6 excludes ipv4 "1080::8:800:200C:417A", "2010:836B:4179::836B:4179"
-- test ipv6 accepts "::", "::1", "::face:b00c"
-- test ipv6 accepts "2001:0db8:0000:0000:0000:ff00:0042:8329", "2001:db8:0:0:0:ff00:42:8329", "2001:db8::ff00:42:8329"
-- test ipv6 accepts "FEDC:BA98:7654:3210:FEDC:BA98:7654:3210", "1080:0:0:0:8:800:200C:4171", "3ffe:2a00:100:7031::1"
-- test ipv6 accepts "1080::8:800:200C:417A", "::192.9.5.5", "::FFFF:129.144.52.38", "2010:836B:4179::836B:4179"
-- TODO: add some rejecting tests
-- Either address family; ipv4 is tried first, then ipv6.
ip = ipv4 / ipv6
-- the above tests validate ip
-- test ip includes ipv4 "1.2.3.4"
-- test ip includes ipv6 "::192.9.5.5"
-- test ip includes ipv4 "::192.9.5.5"
---------------------------------------------------------------------------------------------------
-- DOMAIN NAMES
---------------------------------------------------------------------------------------------------
-- Notes:
-- (1) RFC1035 specifies a grammar for domain names that is MORE RESTRICTIVE than the host names
-- defined in the URI specification, RFC3986.
-- (2) RFC2181 (section 11) says that domain names submitted as queries to DNS can contain any
-- characters. Domain names that can be used as hostnames are a subset of the names that could
-- be submitted to DNS, and so our domain name pattern does NOT allow arbitrary characters.
-- (3) We define the pattern 'fqdn' to match hostnames that contain AT LEAST one dot (.) even though
-- such names may actually be only partially qualified. A better name for this pattern would be
-- 'pqdn' but this is not an easily recognized abbreviation, whereas 'fqdn' is.
-- A port number: one or more decimal digits.  No upper bound (such as 65535) is enforced.
port = [0-9]+
-- A port as written after a host: a colon immediately followed by the port digits.
local alias port_spec = { ":" port }
local alias letter = [[A-Z][a-z]]
local alias let_dig = letter / [0-9]
-- One interior step of a label: a letter/digit, optionally preceded by a run of hyphens.
-- Consecutive hyphens are allowed (RFC1035 ldh-str), which is needed for punycode labels
-- such as "xn--bcher-kva".  Because a letter/digit must follow the hyphens, a label can
-- neither start nor end with "-".
local alias let_dig_hyp = let_dig / {"-"+ let_dig}
-- A single label.  The {0,62} bound counts let_dig_hyp steps, not characters, so the
-- 63-character label limit is only enforced approximately when hyphens are present.
local alias subdomain = { let_dig let_dig_hyp{0,62} } -- Per RFC1123, can start with digit now
-- test fqdn accepts "xn--bcher-kva.example", "a--b.com"
-- Note: An unsigned decimal number like "2.3" can also be a qualified domain name, but it would be
-- an unusual one. The pattern fqdn_strict follows the spec and will accept "2.3". The pattern
-- "fqdn_practical" will reject input matching an unsigned decimal like "2.3". Also, fqdn_practical
-- will similarly reject "1.2.3".
-- Two or more labels joined by single dots, with an optional trailing dot.
alias fqdn_strict_alias = { subdomain {"." subdomain}+ "."?}
fqdn_strict = fqdn_strict_alias
-- Same as the strict form, except that it first rules out input which begins with a num.float
-- that is not immediately continued by further label characters (let_dig_hyp).
alias fqdn_practical_alias = { !{num.float !let_dig_hyp} fqdn_strict_alias }
fqdn_practical = fqdn_practical_alias
-- A practical name or, failing that, a single label; either may be followed by ":" and a port.
fqdn = { {fqdn_practical_alias / subdomain} port_spec? }
-- test fqdn accepts "a.edu", "A.BC.EDU", "X-Y.CS.CORNELL.EDU", "SRI-NIC.ARPA"
-- test fqdn accepts "ibm.com.", "9in.nails.band", "1AAA.SRI-NIC.ARPA"
-- test fqdn accepts "ibm.com:443", "ibm.com.:80"
-- test fqdn rejects ".EDU", "XY-.CS.CORNELL.EDU",
-- test fqdn rejects "ibm.com:", "ibm.com:x"
-- test fqdn_strict rejects "ibm.com:443"
-- test fqdn_strict rejects "a", "abc", "ZZZZZZ", "Z-9"
-- test fqdn_strict accepts "1.2", "3.1415926536", "1.2.", "1.2.3", "1.2.3."
-- test fqdn_practical rejects "1.2", "3.1415926536", "1.2.", "1.2+ ", "-3.14", "6.02e23"
-- test fqdn_practical rejects "1.2.3", "1.2.3."
---------------------------------------------------------------------------------------------------
-- EMAIL ADDRESSES
---------------------------------------------------------------------------------------------------
-- Notes:
-- (1) Per RFC3696, the name part of an email address has some minor restrictions, such as that a
-- dot "." cannot be the last character of the name, which are not enforced here.
-- (2) Per RFC3696, any ASCII character may be part of the name as long as it is escaped with a
-- backslash. The escaped form is NOT SUPPORTED IN the 'name' pattern below, as it is rarely seen
-- in practice (and subject to debate in the Errata).
-- Characters allowed anywhere in an unquoted name.  The dot is deliberately NOT in this set:
-- unquoted_name admits "." only after the first character, so an unquoted name cannot start
-- with a dot (RFC3696, section 3).  A trailing dot and consecutive dots are still accepted.
local alias name_char = { [:alnum:] / [!#$%&'*+\-/=?\^_`{|}~] }
local alias unquoted_name = { name_char {name_char / "."}* }
local alias quoted_name = { ["] [^"]+ ["] } -- " cannot be inside the name
-- The quoted form is tried first, then the unquoted form.
name = quoted_name / unquoted_name
-- A name, "@", and a host, with nothing in between.
email = { name "@" host }
-- test email accepts "first.last@example.com"
-- test email rejects ".leading.dot@example.com"
-- test email accepts "me@here.com", "you+filter@somewhere.org", "k!!{}@example.com", "9name@gmail.com"
-- test email accepts "customer/department=shipping@example.com", "$A12345@example.com"
-- test email accepts "!def!xyz%abc@example.com", "_somename@example.com", "\"Joe.\\Blow\"@example.com"
-- test email accepts "\"John Doe\"@example.com", "\"Can put @ signs here!\"@example.com"
-- test email rejects "here.com", "foo@bar@example.com"
---------------------------------------------------------------------------------------------------
-- URI (https://tools.ietf.org/html/rfc3986)
---------------------------------------------------------------------------------------------------
-- Notes:
-- (1) The "future ip literal" is not implemented (page 19, https://tools.ietf.org/html/rfc3986)
-- (2) Relative URIs of the form "/foo/bar.txt" will match 'path' but not 'uri'
-- Character classes named after the corresponding ABNF rules of RFC 3986.
local alias sub_delims = [!$&'()*+,;=]
local alias gen_delims = [:/?#\[\]@]
-- "%" followed by exactly two hexadecimal digits
local alias pct_encoded = { "%" [[0-9][A-F][a-f]]{2} }
local alias unreserved = [[:alnum:][\-._~]]
local alias pchar = unreserved / pct_encoded / sub_delims / ":" / "@"
-- One name piece.  The first character must be alphanumeric; the rest may be any unreserved,
-- percent-encoded, or sub-delimiter character.
local alias regname = { [:alnum:] {unreserved / pct_encoded / sub_delims}* }
-- One or more name pieces separated by dots, with an optional trailing dot.
registered_name = { regname {"." regname}* "."? }
-- An IPv6 address enclosed in square brackets
ip_literal = { "[" ipv6 "]" }
-- Alternatives are tried in this order: bracketed IPv6, dotted-quad IPv4, registered name.
host = ip_literal / ipv4 / registered_name
-- test host accepts "a", "abc", "ZZZZZZ", "Z-9"
-- test host accepts "a.edu", "A.BC.EDU", "X-Y.CS.CORNELL.EDU", "SRI-NIC.ARPA"
-- test host accepts "ibm.com.", "9in.nails.band", "1AAA.SRI-NIC.ARPA"
-- test host accepts "XY-.CS.CORNELL.EDU" (not a valid domain name, but a valid URI hostname)
-- test host accepts "[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]", "[1080:0:0:0:8:800:200C:417A]"
-- test host accepts "[3ffe:2a00:100:7031::1]"
-- test host accepts "[1080::8:800:200C:417A]", "[::192.9.5.5]", "[::FFFF:129.144.52.38]", "[2010:836B:4179::836B:4179]"
-- test host rejects "ibm.com:443", "example.com.:80"
-- test host rejects ".EDU"
-- test host rejects "ibm.com:", "ibm.com:x"
-- Zero or more userinfo characters, so this pattern can match the empty string.
userinfo = { unreserved / pct_encoded / sub_delims / ":" }*
-- An optional userinfo "@" prefix, then an optional host with an optional port.  Every part is
-- optional, so authority also matches the empty string.
authority = { { userinfo "@" }? {host port_spec?}? }
-- test authority accepts ""
-- Path building blocks, named after the ABNF rules of RFC 3986.
-- A segment is a run of pchar; the _nz forms must be non-empty, and _nz_nc also excludes ":".
local alias segment = pchar*
local alias segment_nz = pchar+
local alias segment_nz_nc = { unreserved / pct_encoded / sub_delims / "@" }+
local alias path_empty = ""
-- Zero or more "/"-prefixed segments
local alias path_abempty = { "/" segment }*
-- A non-empty first segment followed by any number of "/"-prefixed segments
local alias path_rootless = { segment_nz path_abempty }
local alias path_noscheme = { segment_nz_nc path_abempty }
-- A "/" optionally followed by a rootless path
local alias path_absolute = { "/" path_rootless? }
-- Either one or more "/"-prefixed segments, or a non-empty segment followed by at least one
-- "/"-prefixed segment.  Input without any slash does not match.
path = { "/" segment }+ / { segment_nz { "/" segment }+ }
-- test path accepts "/other/link.html" (Sometimes called a "relative URL")
-- test path accepts "/", "/a/b", "a/b", "a.b/", "a/b/c.d"
-- test path rejects "a", "a.b" (no slashes)
-- This is called 'hier_part' in the IETF specifications. We chose a more informative name,
-- indicating that this part of a URI contains the authority (if present) and path (if present).
-- Three forms, tried in order: "//" authority followed by a path, an absolute path, or a
-- rootless path.
authpath = { "//" authority path_abempty } /
path_absolute /
path_rootless
-- fragment and query accept the same characters (any pchar, "/" or "?"); both may be empty.
local fragment = { pchar / "/" / "?" }*
local query = { pchar / "/" / "?" }*
-- A letter followed by any number of letters, digits, "+", "." or "-"
scheme = { [:alpha:] [[:alnum:][+.\-]]* }
-- scheme ":" followed by an optional authpath, an optional "?" query, and an optional "#" fragment
uri = { scheme ":" authpath? { "?" query }? { "#" fragment }? }
-- test uri accepts "ftp://ftp.is.co.za/rfc/rfc1808.txt"
-- test uri accepts "http://www.ietf.org/rfc/rfc2396.txt"
-- test uri accepts "ldap://[2001:db8::7]/c=GB?objectClass?one"
-- test uri accepts "mailto:John.Doe@example.com"
-- test uri accepts "news:comp.infosystems.www.servers.unix"
-- test uri accepts "tel:+1-816-555-1212"
-- test uri accepts "telnet://192.0.2.16:80/"
-- test uri accepts "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"
-- test uri accepts "http://www.google.com", "http://google.com/"
-- test uri accepts "https://www.gitlab.com/rosie-pattern-language"
-- test uri accepts "ftp://some.ftp.net/path/to/file.zip"
-- test uri accepts "http://example.com/mypage.html", "ftp://example.com/download.zip"
-- test uri accepts "mailto:user@example.com", "file:///home/user/file.txt", "tel:1-888-555-5555"
-- test uri accepts "telnet://192.0.2.16:80/", "http://example.com/resource?foo=bar#fragment"
-- test uri accepts "foo:"
-- test uri rejects "foo"
---------------------------------------------------------------------------------------------------
-- URL
---------------------------------------------------------------------------------------------------
-- "The term 'URL' does not refer to a formal partition of URI space; rather, URL is a useful but
-- informal concept" (http://www.w3.org/TR/uri-clarification).
--
-- We will take advantage of the informality to use the name 'url' to refer to a uri that requires
-- an authpath. A uri will match "foo:" but a url will not.
--
-- Another point of difference between URI and URL is that we define our url pattern to reject input
-- of the form "xyz::w".
--
-- Finally, we define url_strict to follow the rules we have stated thus far. We call it
-- url_strict because, using its strict definition, it will match many strings that do not look like
-- typical URLs. The pattern url will match URLs as they are commonly found.
-- Like uri, except that authpath is required and the character right after the scheme's ":"
-- must not be another ":".
url_strict = { scheme ":" !":" authpath { "?" query }? { "#" fragment }? }
-- Like url_strict, except that the look-ahead requires "//" right after the scheme's ":".
url = { scheme ":" >"//" authpath { "?" query }? { "#" fragment }? }
-- %% Change from v1.0.0: The Rosie v1.0.0 net.url has been renamed to net.url_strict in order to
-- %% follow the naming convention established already by fqdn_strict and ipv6_strict.
url_common = url -- Backwards compatibility with v1.0.0
-- test url_strict accepts "https://rosie-lang.org", "ftp://example.net/Rosie/The/Riveter#WorldWar2"
-- test url_strict accepts "a:b", "a:b?c", "a:b?c#d"
-- test url_strict accepts "bold:net.*=red:net.ipv6=red;"
-- test url_strict rejects "foo:", "xyz::w"
-- test url accepts "https://rosie-lang.org", "ftp://example.net/Rosie/The/Riveter#WorldWar2"
-- test url rejects "a:b", "a:b?c", "a:b?c#d"
-- test url rejects "bold:net.*=red:net.ipv6=red;"
-- test url rejects "foo:", "xyz::w"
---------------------------------------------------------------------------------------------------
-- HTTP commands
---------------------------------------------------------------------------------------------------
-- Some very simple HTTP patterns
-- The recognized HTTP method names
http_command_name = "GET" / "HEAD" / "PUT" / "POST" / "DELETE" /
"TRACE" / "OPTIONS" / "CONNECT" / "PATCH"
-- A method name followed by a url
http_command = http_command_name url
-- "HTTP", a slash, and a dotted version number
http_version = {"HTTP" "/" [:digit:]+ "." [:digit:]+} -- e.g. "HTTP/1.1"
---------------------------------------------------------------------------------------------------
-- MAC addresses
---------------------------------------------------------------------------------------------------
-- Three groups of four hex digits separated by dots
MAC_cisco = { [:xdigit:]{4} {"." [:xdigit:]{4}}{2} }
-- Six pairs of hex digits separated by dashes
MAC_windows = { [:xdigit:]{2} {"-" [:xdigit:]{2}}{5} }
-- Six pairs of hex digits separated by colons
MAC_common = { [:xdigit:]{2} {":" [:xdigit:]{2}}{5} }
-- Any of the three notations, tried in the order listed
MAC = MAC_cisco / MAC_windows / MAC_common
-- TODO: MAC needs tests
---------------------------------------------------------------------------------------------------
-- net.any will match common network patterns (but not all the patterns defined above!)
---------------------------------------------------------------------------------------------------
-- Match an IP address, MAC address, domain name, email address, URL, or path.
-- The alternatives are tried in the order written.
any = ip / MAC / fqdn_practical / email / url / path