forked from pqwy/notty
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use internal Unicode property database instead. - Con: An 84K larger Notty. - Pro: 10M of deps less.
- Loading branch information
Showing
12 changed files
with
724 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,5 +9,6 @@ tmp | |
\.\#* | ||
\#*# | ||
|
||
*.json | ||
gmon.out | ||
rondom |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Cannibalized bits of Uucp: | ||
|
||
- `Notty_uucp_data` is generated from an actual Uucp installation. | ||
- `Notty_uucp` uses it to provide the few Unicode properties that Notty needs. | ||
- `Notty_grapheme_cluster` is `Grapheme_cluster` from Uuseg, adapted to use the | ||
above. | ||
|
||
Compiled size of these is on the order of 70K. Uucp is presently a monolithic 10M. | ||
|
||
The idea is to remove these in favor of the actual Uucp/Uuseg, as soon as it | ||
becomes possible to depend only on the necessary parts of Uucp. | ||
|
||
Uucp and Uuseg are Copyright (c) 2014 Daniel C. Bünzli. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
(*--------------------------------------------------------------------------- | ||
Copyright (c) 2014 Daniel C. Bünzli. All rights reserved. | ||
Distributed under the ISC license, see terms at the end of the file. | ||
%%NAME%% %%VERSION%% | ||
---------------------------------------------------------------------------*) | ||
|
||
(* These are the rules as found in [1], with property values aliases [2] | ||
substituted. | ||
GB1. sot ÷ Any | ||
GB2. Any ÷ eot | ||
GB3. CR × LF | ||
GB4. (CN|CR|LF) ÷ | ||
GB5. ÷ (CN|CR|LF) | ||
GB6. L × (L|V|LV|LVT) | ||
GB7. (LV|V) × (V|T) | ||
GB8. (LVT|T) × T | ||
GB9. × (EX|ZWJ) | ||
GB9a. × SM | ||
GB9b. PP × | ||
GB10. (v10.0.0) (EB|EBG) EX* × EM | ||
GB11. (v10.0.0) ZWJ × (GAZ|EBG) | ||
GB12. sot (RI RI)* RI × RI | ||
GB13. [^RI] (RI RI)* × RI | ||
GB999. Any ÷ Any | ||
[1]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries | ||
[2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt | ||
[3]: http://www.unicode.org/Public/7.0.0/ucd/auxiliary/GraphemeBreakTest.html | ||
By the structure of the rules we see that grapheme cluster
boundaries can *mostly* be determined by simply looking at the
grapheme cluster break property values of the characters on the left
and on the right of a boundary. The exceptions are GB10 and GB12-13,
which are handled specially by enriching the segmenter state in
a horribly ad-hoc fashion. *)
|
||
(* Values returned by [add]:
   - [`Await]: the segmenter wants more input;
   - [`Boundary]: a grapheme cluster boundary was found;
   - [`Uchar u]: [u] belongs to the current cluster;
   - [`End]: the end of input was reached. *)
type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]
|
||
(* Grapheme cluster break property values, named after the aliases used in
   the rules above. [Sot] is not a property value: it is the "start of text"
   marker used as the initial left-hand side of a boundary. *)
type gcb =
  | CN | CR | EX | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | RI
  | SM | T | V | XX | ZWJ | Sot
|
||
(* WARNING. The indexes used here need to be synchronized with those
   assigned by uucp for Uucp.Break.Low.grapheme_cluster. *)

(* Maps the integer code of a grapheme cluster break property value to the
   corresponding [gcb] constructor. [Sot] is deliberately absent: it never
   comes out of the property table. *)
let byte_to_gcb =
  [| CN; CR; EX; EB; EBG; EM; GAZ; L; LF; LV; LVT; PP; RI;
     SM; T; V; XX; ZWJ; |]
|
||
(* [gcb u] is the grapheme cluster break property value of [u], obtained by
   decoding the integer returned by [Notty_uucp.grapheme_cluster_boundary]
   through [byte_to_gcb]. The array access is bounds-checked, so an integer
   outside the table raises [Invalid_argument]. *)
let gcb u = byte_to_gcb.(Notty_uucp.grapheme_cluster_boundary u)
|
||
(* Phase of the segmenter's add/await protocol. *)
type state =
  | Fill  (* get next uchar to decide boundary. *)
  | Flush (* a uchar is buffered, client needs to get it out with `Await. *)
  | End   (* `End was added. *)
|
||
(* Mutable segmenter state. [odd_ri] and [emoji_seq] carry the extra left
   context needed by the rules that cannot be decided from the immediate
   left/right property values alone (GB10, GB12, GB13). *)
type t =
  { mutable state : state;                (* current state. *)
    mutable left : gcb;                   (* break property value left of boundary. *)
    mutable odd_ri : bool;                (* odd number of RI on the left. *)
    mutable emoji_seq : bool;             (* (EB|EBG) Extend* on the left. *)
    mutable buf : [ `Uchar of Uchar.t ] } (* bufferized add. *)
|
||
(* Placeholder (U+0000) for the [buf] field of a fresh segmenter. [Uchar.min]
   is U+0000, so no unsafe conversion is needed. *)
let nul_buf = `Uchar Uchar.min
|
||
(* [create ()] is a fresh segmenter: positioned at start of text ([Sot]),
   waiting for input ([Fill]), with no regional-indicator or emoji context
   recorded. *)
let create () =
  { state = Fill; left = Sot;
    odd_ri = false; emoji_seq = false;
    buf = nul_buf (* overwritten *); }
|
||
(* [break s right] is [true] iff there is a grapheme cluster boundary between
   the character last seen by [s] (whose property value is [s.left]) and a
   character with property value [right].

   The cases follow the order of the rules, so the first matching rule wins.
   GB10 and GB12/13 additionally consult the left context accumulated in
   [s.emoji_seq] and [s.odd_ri]. [s] is only read, never modified. *)
let break s right = match s.left, right with
  | (* GB1 *)     Sot, _ -> true
    (* GB2 is handled by `End *)
  | (* GB3 *)     CR, LF -> false
  | (* GB4 *)     (CN|CR|LF), _ -> true
  | (* GB5 *)     _, (CN|CR|LF) -> true
  | (* GB6 *)     L, (L|V|LV|LVT) -> false
  | (* GB7 *)     (LV|V), (V|T) -> false
  | (* GB8 *)     (LVT|T), T -> false
  | (* GB9+a *)   _, (EX|ZWJ|SM) -> false
  | (* GB9b *)    PP, _ -> false
  | (* GB10 *)    _, EM when s.emoji_seq -> false
  | (* GB11 *)    ZWJ, (GAZ|EBG) -> false
  | (* GB12+13 *) RI, RI when s.odd_ri -> false
  | (* GB999 *)   _, _ -> true
|
||
(* [update_left s right] makes [right] the new left-hand side of the next
   boundary decision and refreshes the extra context:
   - [EX] leaves [emoji_seq] untouched, so a run of Extend after an emoji
     base keeps the sequence alive;
   - [EB]/[EBG] start an emoji sequence;
   - [RI] flips the parity of the current regional-indicator run;
   - anything else resets both flags. *)
let update_left s right =
  s.left <- right;
  match s.left with
  | EX -> (* keep s.emoji_seq as is *) s.odd_ri <- false
  | EB | EBG -> s.emoji_seq <- true; s.odd_ri <- false
  | RI -> s.emoji_seq <- false; s.odd_ri <- not s.odd_ri
  | _ -> s.emoji_seq <- false; s.odd_ri <- false
|
||
(* [add s v] feeds [v] to the segmenter [s] and returns the next output.

   - [`Uchar u], only legal while [s] is filling: if no boundary precedes
     [u], [u] is returned right away. Otherwise [u] is buffered, [`Boundary]
     is returned, and the client must call [add s `Await] to get [u] out.
   - [`Await]: returns the buffered character if there is one, [`End] once
     the input has been ended, and [`Await] when more input is needed.
   - [`End], only legal while [s] is filling: ends the input. Returns the
     final [`Boundary] (rule GB2), or [`End] directly if no character was
     ever added.

   Adding [`Uchar _] or [`End] while a character is still buffered, or after
   [`End] was added, is a protocol violation and trips [assert false]. *)
let add s = function
  | `Uchar u as add ->
      begin match s.state with
      | Fill ->
          let right = gcb u in
          (* Decide on the boundary before [update_left] overwrites the
             left context that [break] reads. *)
          let break = break s right in
          update_left s right;
          if not break then add else
            (s.state <- Flush; s.buf <- add; `Boundary)
      | Flush | End -> assert false
      end
  | `Await ->
      begin match s.state with
      | Flush -> s.state <- Fill; (s.buf :> ret)
      | End -> `End
      | Fill -> `Await
      end
  | `End ->
      begin match s.state with
      | Fill -> s.state <- End; if s.left = Sot then `End else `Boundary
      | Flush | End -> assert false
      end
|
||
(*--------------------------------------------------------------------------- | ||
Copyright (c) 2014 Daniel C. Bünzli | ||
Permission to use, copy, modify, and/or distribute this software for any | ||
purpose with or without fee is hereby granted, provided that the above | ||
copyright notice and this permission notice appear in all copies. | ||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
---------------------------------------------------------------------------*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
(*--------------------------------------------------------------------------- | ||
Copyright (c) 2014 Daniel C. Bünzli. All rights reserved. | ||
Distributed under the ISC license, see terms at the end of the file. | ||
%%NAME%% %%VERSION%% | ||
---------------------------------------------------------------------------*) | ||
|
||
(* Values returned by [add]: more input is awaited, a grapheme cluster
   boundary, the end of input, or a character of the current cluster. *)
type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]
|
||
(* Grapheme cluster segmenters. *)
type t

(* [create ()] is a fresh segmenter, positioned at the start of text. *)
val create : unit -> t

(* [add s v] feeds [v] to [s] and returns the next output. After a
   [`Boundary] returned in response to a [`Uchar _], call [add s `Await] to
   obtain that character before adding anything else. *)
val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> ret
|
||
(*--------------------------------------------------------------------------- | ||
Copyright (c) 2014 Daniel C. Bünzli | ||
Permission to use, copy, modify, and/or distribute this software for any | ||
purpose with or without fee is hereby granted, provided that the above | ||
copyright notice and this permission notice appear in all copies. | ||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
---------------------------------------------------------------------------*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
(* Copyright (c) 2020 David Kaloper Meršinjak. All rights reserved. | ||
See LICENSE.md. *) | ||
|
||
(* Unpacked interval lookup table.

   [find_i ~def k (los, his, vs)] binary-searches for an index [x] with
   [los.(x) <= k <= his.(x)] and returns [vs.(x)]; it returns [def] when no
   interval contains [k].

   The search presumes that the intervals are sorted in increasing order and
   do not overlap. Elements are read with [Array.unsafe_get] using indices
   bounded by [Array.length los] only, so [his] and [vs] must be at least as
   long as [los]. *)
let find_i ~def k (xs, _, _ as tab) =
  (* [tab], [k] and [def] are passed explicitly rather than captured. *)
  let rec go i j (los, his, vs as tab) (k: int) def =
    if i > j then def else
    let x = (i + j) / 2 in
    if k < Array.unsafe_get los x then go i (x - 1) tab k def else
    if k > Array.unsafe_get his x then go (x + 1) j tab k def else
      Array.unsafe_get vs x in
  go 0 (Array.length xs - 1) tab k def
|
||
(* Fixed-depth 8-8-8-bit trie; root is variable, levels 2,3 are either empty
   or full.

   [find_t ~def k tab] is the byte stored for [k] in [tab], or [def] when the
   root is too short for [k] or when the second-level array or the leaf
   string on the path to [k] is empty.

   Keys above 0xd7ff are shifted down by 0x800 before the lookup, which
   closes the gap left by the surrogate range (0xd800-0xdfff).

   Non-empty levels are read with unsafe accessors: a non-empty second-level
   array must have 256 entries and a non-empty leaf must be 256 bytes long. *)
let find_t ~def k tab =
  let k = if k > 0xd7ff then k - 0x800 else k in
  let b0 = (k lsr 16) land 0xff in
  if Array.length tab <= b0 then def else
    match Array.unsafe_get tab b0 with
    | [||] -> def
    | arr ->
        (match Array.unsafe_get arr ((k lsr 8) land 0xff) with
         | "" -> def
         | str -> String.unsafe_get str (k land 0xff) |> Char.code)
|
||
(* We catch w = -1 and default to w = 1 to minimize the table.

   [tty_width_hint u] is:
   - 0 for U+0000;
   - -1 for the other control characters (U+0001-U+001F, U+007F-U+009F);
   - 1 for everything else up to U+02FF;
   - the value found in [Notty_uucp_data.tty_width_hint] above that,
     defaulting to 1 for code points the table does not cover. *)
let tty_width_hint u = match Uchar.to_int u with
  | 0 -> 0
  | u when u <= 0x001F || (0x007F <= u && u <= 0x009F) -> -1
  | u when u <= 0x02ff -> 1
  | u -> find_i ~def:1 u Notty_uucp_data.tty_width_hint
|
||
(* [grapheme_cluster_boundary u] is the integer code of the grapheme cluster
   break property of [u], looked up in
   [Notty_uucp_data.grapheme_cluster_boundary]. Code points absent from the
   table get 16, which is the index of [XX] in the grapheme cluster
   segmenter's [byte_to_gcb] table. *)
let grapheme_cluster_boundary u =
  find_t ~def:16 (Uchar.to_int u) Notty_uucp_data.grapheme_cluster_boundary
|
||
(* let check () = *) | ||
(* let pp_u ppf u = Format.fprintf ppf "u+%04x" (Uchar.to_int u) in *) | ||
(* let rec go i u = *) | ||
(* let w1 = tty_width_hint u *) | ||
(* and w2 = Uucp.Break.tty_width_hint u in *) | ||
(* if w1 <> w2 then Format.printf "w: %a here: %d there: %d@." pp_u u w1 w2; *) | ||
(* let gc1 = grapheme_cluster_boundary u *) | ||
(* and gc2 = Uucp.Break.Low.grapheme_cluster u in *) | ||
(* if gc1 <> gc2 then Format.printf "gc: %a here: %d there: %d@." pp_u u gc1 gc2; *) | ||
(* if u = Uchar.max then i else go (i + 1) (Uchar.succ u) in *) | ||
(* let n = go 1 Uchar.min in *) | ||
(* Format.printf "Checked equality for %d code points.@." n *) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
(* Copyright (c) 2020 David Kaloper Meršinjak. All rights reserved. | ||
See LICENSE.md. *) | ||
|
||
(* This is a local copy of the (very few) relevant [uucp] properties. *) | ||
|
||
val tty_width_hint : Uchar.t -> int
(* [Uucp.Break.tty_width_hint]. The result is -1 for control characters
   other than U+0000, and 0 for U+0000. *)
|
||
val grapheme_cluster_boundary : Uchar.t -> int
(* [Uucp.Break.Low.grapheme_cluster]. The result is the integer code of the
   grapheme cluster break property value, not a boolean. *)
|
||
(* val check : unit -> unit *) | ||
|
Oops, something went wrong.