Skip to content

Commit

Permalink
Show file tree
Hide file tree
Showing 4 changed files with 277 additions and 73 deletions.
27 changes: 24 additions & 3 deletions src/core/s-mold.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,22 +474,38 @@ STOID Mold_Issue(REBVAL *value, REB_MOLD *mold)
STOID Mold_Url(REBVAL *value, REB_MOLD *mold)
{
REBUNI *dp;
REBCNT n;
REBCNT n, i;
REBUNI c;
REBCNT len = VAL_LEN(value);
REBSER *ser = VAL_SERIES(value);
REBYTE buf[10];
REBCNT ulen;

// Compute extra space needed for hex encoded characters:
for (n = VAL_INDEX(value); n < VAL_TAIL(value); n++) {
c = GET_ANY_CHAR(ser, n);
if (IS_URL_ESC(c)) len += 2;
// unicode chars must be also encoded...
else if (c < (REBCNT)0x80) continue;
else if (c >= (REBCNT)0x0010FFFF) len += 14;
else if (c >= (REBCNT)0x10000) len += 11;
else if (c >= (REBCNT)0x800) len += 8;
else if (c >= (REBCNT)0x80) len += 5;
}

dp = Prep_Uni_Series(mold, len);

for (n = VAL_INDEX(value); n < VAL_TAIL(value); n++) {
c = GET_ANY_CHAR(ser, n);
if (IS_URL_ESC(c)) dp = Form_Hex_Esc_Uni(dp, c); // c => %xx
else if (c >= 0x80) {
// to avoid need to first convert whole url to utf8,
// use the temp buffer for any unicode char...
ulen = Encode_UTF8_Char(&buf, c);
for (i = 0; i < ulen; i++) {
dp = Form_Hex_Esc_Uni(dp, (REBUNI)buf[i]);
}
}
else *dp++ = c;
}

Expand Down Expand Up @@ -1570,12 +1586,17 @@ STOID Mold_Error(REBVAL *value, REB_MOLD *mold, REBFLG molded)
Char_Escapes[LF] = '/';
Char_Escapes['"'] = '"';
Char_Escapes['^'] = '^';

URL_Escapes = cp = Make_Mem(MAX_URL_CHAR+1); // cleared
//for (c = 0; c <= MAX_URL_CHAR; c++) if (IS_LEX_DELIMIT(c)) cp[c] = ESC_URL;
// escape all chars from #"^(00)" to #"^(20)"
for (c = 0; c <= ' '; c++) cp[c] = ESC_URL | ESC_FILE;
// and also all chars which are a lexer delimiters
dc = b_cast(";%\"()[]{}<>");
for (c = (REBYTE)LEN_BYTES(dc); c > 0; c--) URL_Escapes[*dc++] = ESC_URL | ESC_FILE;
// RFC3986 allows unescaped only: ALPHA, DIGIT and "-._~:/?#[]@!$&'()*+,;="
// so include also folowing chars for url escaping...
dc = b_cast("\x5C\x5E\x60\x7C\x7F");
for (c = (REBYTE)LEN_BYTES(dc); c > 0; c--) URL_Escapes[*dc++] = ESC_URL;
}


Expand Down
176 changes: 134 additions & 42 deletions src/mezz/sys-ports.reb
Original file line number Diff line number Diff line change
Expand Up @@ -83,52 +83,144 @@ make-port*: func [
port
]

*parse-url: make object! [
digit: make bitset! "0123456789"
digits: [1 5 digit]
alpha-num: make bitset! [#"a" - #"z" #"A" - #"Z" #"0" - #"9"]
scheme-char: insert copy alpha-num "+-."
path-char: complement make bitset! "#"
user-char: complement make bitset! ":@"
host-char: complement make bitset! ":/?"
s1: s2: none ; in R3, input datatype is preserved - these are now URL strings!
out: []
emit: func ['w v] [reduce/into [to set-word! w if :v [to string! :v]] tail out]

rules: [
; Scheme://user-host-part
[
; scheme name: [//]
copy s1 some scheme-char ":" opt "//" ; we allow it
(reduce/into [to set-word! 'scheme to lit-word! to string! s1] tail out)

; optional user [:pass]
opt [
copy s1 some user-char
opt [#":" copy s2 to #"@" (emit pass s2)]
#"@" (emit user s1)
]
url-parser: make object! [
;; Source of this url-parser is inspired by Gregg Irwin's code:
;; https://gist.github.com/greggirwin/207149d46441cd48a1426e60926a7d25
;; which is now used in Red:
;; https://github.com/red/red/blob/f619641b573621ee4c0ca7e0a8b706053db53a36/environment/networking.red#L34-L209
;; Output of this version is different than in Red!

out: make block! 14
value: none

;-- Basic Character Sets
digit: system/catalog/bitsets/numeric
alpha: system/catalog/bitsets/alpha
alpha-num: system/catalog/bitsets/alpha-numeric
hex-digit: system/catalog/bitsets/hex-digits

;-- URL Character Sets
;URIs include components and subcomponents that are delimited by characters in the "reserved" set.
gen-delims: #[bitset! #{000000001001002180000014}] ;= charset ":/?#[]@"
sub-delims: #[bitset! #{000000004BF80014}] ;= charset "!$&'()*+,;="
reserved: #[bitset! #{000000005BF9003580000014}] ;= [gen-delims | sub-delims]
;The purpose of reserved characters is to provide a set of delimiting
;characters that are distinguishable from other data within a URI.

;Characters that are allowed in a URI but do not have a reserved purpose are "unreserved"
unreserved: #[bitset! #{000000000006FFC07FFFFFE17FFFFFE2}] ;= compose [alpha | digit | (charset "-._~")]
scheme-char: #[bitset! #{000000000016FFC07FFFFFE07FFFFFE0}] ;= union alpha-num "+-."

;-- URL Grammar
url-rules: [
scheme-part
hier-part (
if all [value not empty? value][
case [
out/scheme = 'mailto [
emit target to string! dehex :value
]

; optional host [:port]
opt [
copy s1 any host-char
opt [#":" copy s2 digits (compose/into [port: (to integer! s2)] tail out)]
(unless empty? s1 [attempt [s1: to tuple! s1] emit host s1])
all [out/scheme = 'urn parse value [
; case like: urn:example:animal:ferret:nose (#":" is not a valid file char)
; https://datatracker.ietf.org/doc/html/rfc2141
copy value to #":" (
emit path to string! dehex value ;= Namespace Identifier
)
1 skip
copy value to end (
emit target to string! dehex value ;= Namespace Specific String
)
]] true

'else [
value: to file! dehex :value
either dir? value [
emit path value
][
value: split-path value
if %./ <> value/1 [emit path value/1]
emit target value/2
]
]
]
]
)
opt query
opt fragment
]
scheme-part: [copy value [alpha any scheme-char] #":" (emit scheme to lit-word! lowercase to string! :value)]
hier-part: [#"/" #"/" authority path-abempty | path-absolute | path-rootless | path-empty]

; The authority component is preceded by a double slash ("//") and is
; terminated by the next slash ("/"), question mark ("?"), or number
; sign ("#") character, or by the end of the URI.
authority: [opt user host opt [#":" port]]
user: [
copy value [any [unreserved | pct-encoded | sub-delims | #":"] #"@"]
(
take/last value
value: to string! dehex value
parse value [
copy value to #":" (emit user value)
1 skip
copy value to end ( emit pass value)
|
(emit user value)
]
)
]
host: [
ip-literal (emit host to string! dehex :value)
|
copy value any [unreserved | pct-encoded | sub-delims]
(unless empty? value [emit host to string! dehex :value])
]
ip-literal: [copy value [[#"[" thru #"]"] | ["%5B" thru "%5D"]]] ; simplified from [IPv6address | IPvFuture]
port: [copy value [1 5 digit] (emit port to integer! to string! :value)]
pct-encoded: [#"%" 2 hex-digit]
pchar: [unreserved | pct-encoded | sub-delims | #":" | #"@"] ; path characters
path-abempty: [copy value any-segments | path-empty]
path-absolute: [copy value [#"/" opt [segment-nz any-segments]]]
path-rootless: [copy value [segment-nz any-segments]]
path-empty: [none]
segment: [any pchar]
segment-nz: [some pchar]
segment-nz-nc: [some [unreserved | pct-encoded | sub-delims | #"@"]] ; non-zero-length segment with no colon
any-segments: [any [#"/" segment]]
query: [#"?" copy value any [pchar | slash | #"?"] (emit query to string! dehex :value)]
fragment: [#"#" copy value any [pchar | slash | #"?"] (emit fragment to string! dehex :value)]

; Helper function
emit: func ['w v] [reduce/into [to set-word! w :v] tail out]


;-- Parse Function
parse-url: function [
"Return object with URL components, or cause an error if not a valid URL"
url [url! string!]
][
;@@ MOLD of the url! preserves (and also adds) the percent encoding.
;@@ binary! is used to have `dehex` on results decode UTF8 chars correctly
;@@ see: https://github.com/Oldes/Rebol-issues/issues/1986
result: either parse to binary! mold as url! url url-rules [
copy out
][
none
]

; optional path
opt [copy s1 some path-char (emit path s1)]

; optional bookmark
opt [#"#" copy s1 to end (emit tag s1)]
; cleanup (so there are no remains visible in the url-parser object)
clear out
set 'value none
; done
result
]

decode-url: func ["Decode a URL according to rules of sys/*parse-url." url] [
--- "This function is bound in the context of sys/*parse-url."
out: make block! 8
parse/all url rules
out
; Exported function (Rebol compatible name)
set 'decode-url function [
"Decode a URL into an object containing its constituent parts"
url [url! string!]
][
parse-url url
]
]

Expand Down Expand Up @@ -181,7 +273,7 @@ init-schemes: func [
][
log/debug 'REBOL "Init schemes"

sys/decode-url: lib/decode-url: :sys/*parse-url/decode-url
sys/decode-url: lib/decode-url: :sys/url-parser/parse-url

system/schemes: make object! 11

Expand Down
28 changes: 0 additions & 28 deletions src/tests/units/port-test.r3
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,6 @@ Rebol [

~~~start-file~~~ "port"

===start-group=== "decode-url"
;@@ https://github.com/Oldes/Rebol-issues/issues/2380
--test-- "decode-url-unicode"
url: decode-url http://example.com/get?q=ščř#kovtička
--assert url/scheme = 'http
--assert url/host = "example.com"
--assert url/path = "/get?q=ščř"
--assert url/tag = "kovtička"
--test-- "decode-url-unicode"
url: decode-url http://švéd:břéť[email protected]:8080/get?q=ščř#kovtička
--assert url/scheme = 'http
--assert url/user = "švéd"
--assert url/pass = "břéťa"
--assert url/host = "example.com"
--assert url/port = 8080
--assert url/path = "/get?q=ščř"
--assert url/tag = "kovtička"
--test-- "decode-url http://host?query"
url: decode-url http://host?query
--assert url/host = "host"
--assert url/path = "?query"
--test-- "decode-url tcp://:9000"
;@@ https://github.com/Oldes/Rebol-issues/issues/1275
url: decode-url tcp://:9000
--assert url/scheme = 'tcp
--assert url/port = 9000

===end-group===

===start-group=== "directory port"
;@@ https://github.com/Oldes/Rebol-issues/issues/2320
Expand Down
Loading

1 comment on commit 7bf61fa

@Oldes
Copy link
Owner Author

@Oldes Oldes commented on 7bf61fa Apr 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit affects also Oldes/Rebol-issues#2406

Please sign in to comment.